fix(misc): three small defensive fixes from PR #1974

Salvages the three substantive low-severity fixes from Gutslabs' #1974 "misc bug fixes" bundle. The other 8 claims in that PR were either already fixed on main with superior implementations (state lock, firecrawl lazy import, fcntl/msvcrt guard, path normalization, schema migrations) or did not survive review. - run_agent: `_materialize_data_url_for_vision` uses `NamedTemporaryFile(delete=False)`; if `base64.b64decode` raises on a corrupt data URL the temp file would persist forever. Wrap the write in try/except and `os.unlink` the temp on failure. - gateway/session: `append_to_transcript` JSONL write had no error handling, so disk-full / read-only-fs / permission errors crashed the message handler. The SQLite write above is the primary store, so swallow OSError on the JSONL fallback with a debug log. - gateway/status: `_read_pid_record` reads `pid_path.read_text()` after an `exists()` check; if the PID file is deleted between the two calls (concurrent gateway restart) we hit an unhandled OSError. Catch it and return None. Adds a regression test for the tempfile cleanup; the other two paths are defensive try/excepts on infrequent OSError that don't warrant dedicated tests. Co-authored-by: Teknium <127238744+teknium1@users.noreply.github.com>
2026-05-18 04:41:56 +00:00 · 2026-05-10 22:20:25 -07:00 · 2026-05-10 22:20:25 -07:00 · 3af3c4eb8c
commit 3af3c4eb8c
parent 482d49cf90
4 changed files with 78 additions and 6 deletions
--- a/gateway/session.py
+++ b/gateway/session.py
@ -1276,9 +1276,14 @@ class SessionStore:
        
        # Also write legacy JSONL (keeps existing tooling working during transition)
        transcript_path = self.get_transcript_path(session_id)
-        with self._lock:
-            with open(transcript_path, "a", encoding="utf-8") as f:
-                f.write(json.dumps(message, ensure_ascii=False) + "\n")
+        try:
+            with self._lock:
+                with open(transcript_path, "a", encoding="utf-8") as f:
+                    f.write(json.dumps(message, ensure_ascii=False) + "\n")
+        except OSError as e:
+            # Disk full / read-only fs / permission errors must not crash the
+            # message handler — the SQLite write above is the primary store.
+            logger.debug("Failed to write JSONL transcript for %s: %s", session_id, e)
    
    def rewrite_transcript(self, session_id: str, messages: List[Dict[str, Any]]) -> None:
        """Replace the entire transcript for a session with new messages.
--- a/gateway/status.py
+++ b/gateway/status.py
@ -218,7 +218,11 @@ def _read_pid_record(pid_path: Optional[Path] = None) -> Optional[dict]:
    if not pid_path.exists():
        return None

-    raw = pid_path.read_text().strip()
+    try:
+        raw = pid_path.read_text().strip()
+    except OSError:
+        # File was deleted between exists() and read_text(), or permission flipped.
+        return None
    if not raw:
        return None

--- a/run_agent.py
+++ b/run_agent.py
@ -8793,8 +8793,17 @@ class AIAgent:
            "image/jpg": ".jpg",
        }.get(mime, ".jpg")
        tmp = tempfile.NamedTemporaryFile(prefix="anthropic_image_", suffix=suffix, delete=False)
-        with tmp:
-            tmp.write(base64.b64decode(data))
+        try:
+            with tmp:
+                tmp.write(base64.b64decode(data))
+        except Exception:
+            # delete=False means a corrupt/unsupported data URL would otherwise
+            # leak a zero-byte temp file on every failed materialization.
+            try:
+                os.unlink(tmp.name)
+            except OSError:
+                pass
+            raise
        path = Path(tmp.name)
        return str(path), path

--- a/tests/run_agent/test_materialize_data_url_cleanup.py
+++ b/tests/run_agent/test_materialize_data_url_cleanup.py
@ -0,0 +1,54 @@
+"""Regression test: temp file cleanup when materializing data URLs for vision.
+
+`_materialize_data_url_for_vision` creates a `NamedTemporaryFile(delete=False)`
+so the path can be handed to vision backends.  If `base64.b64decode` raises on
+a corrupt/unsupported data URL the temp file would otherwise persist forever
+on disk, leaking once per failed call.
+"""
+
+from __future__ import annotations
+
+import base64
+import os
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from run_agent import AIAgent
+
+
+def _list_anthropic_tmpfiles(tmpdir: str) -> list[str]:
+    return [
+        name for name in os.listdir(tmpdir)
+        if name.startswith("anthropic_image_")
+    ]
+
+
+def test_b64decode_failure_does_not_leak_tempfile(monkeypatch, tmp_path):
+    monkeypatch.setattr(tempfile, "tempdir", str(tmp_path))
+
+    bad_url = "data:image/png;base64,!!!not-valid-base64!!!"
+    with pytest.raises(Exception):
+        AIAgent._materialize_data_url_for_vision(bad_url)
+
+    leftovers = _list_anthropic_tmpfiles(str(tmp_path))
+    assert leftovers == [], f"leaked temp files after decode failure: {leftovers}"
+
+
+def test_successful_decode_returns_path_to_existing_file(monkeypatch, tmp_path):
+    monkeypatch.setattr(tempfile, "tempdir", str(tmp_path))
+
+    payload = b"\x89PNG\r\n\x1a\n" + b"\x00" * 16  # a few bytes is enough
+    encoded = base64.b64encode(payload).decode("ascii")
+    good_url = f"data:image/png;base64,{encoded}"
+
+    path_str, path_obj = AIAgent._materialize_data_url_for_vision(good_url)
+
+    assert isinstance(path_obj, Path)
+    assert path_obj.exists()
+    assert path_obj.read_bytes() == payload
+    assert path_str == str(path_obj)
+    # Caller is responsible for cleanup; mimic that here so the test leaves
+    # no artifacts behind.
+    path_obj.unlink()