diff --git a/tests/tools/test_todo_tool.py b/tests/tools/test_todo_tool.py index 6215078525c..dbb64e80ee6 100644 --- a/tests/tools/test_todo_tool.py +++ b/tests/tools/test_todo_tool.py @@ -117,3 +117,61 @@ class TestTodoToolFunction: def test_no_store_returns_error(self): result = json.loads(todo_tool()) assert "error" in result + + +class TestTodoStoreBounds: + """Bounds on persisted todo state (GHSA-5g4g-6jrg-mw3g hardening). + + The todo list is re-injected into context after every compression event, + so an unbounded item — whether authored by the model or replayed from + caller-supplied history on the API server's _hydrate_todo_store path — + would defeat the compression it rides through. These pin the caps. + Not a security boundary (the API surface is authenticated and the caller + supplies their own history); this is footgun containment / parity. + """ + + def test_oversized_content_is_truncated(self): + from tools.todo_tool import MAX_TODO_CONTENT_CHARS + store = TodoStore() + store.write([{"id": "1", "content": "A" * 50001, "status": "pending"}]) + item = store.read()[0] + assert len(item["content"]) <= MAX_TODO_CONTENT_CHARS + assert item["content"].endswith("… [truncated]") + + def test_injection_block_is_bounded(self): + from tools.todo_tool import MAX_TODO_CONTENT_CHARS + store = TodoStore() + store.write([{"id": "1", "content": "A" * 50001, "status": "pending"}]) + inj = store.format_for_injection() + # Before the fix this was ~50085 chars; now it tracks the cap. + assert len(inj) < MAX_TODO_CONTENT_CHARS + 200 + + def test_merge_update_content_is_capped(self): + """The merge path updates content directly, bypassing _validate — + verify it is capped too.""" + from tools.todo_tool import MAX_TODO_CONTENT_CHARS + store = TodoStore() + store.write([{"id": "1", "content": "short", "status": "pending"}]) + store.write([{"id": "1", "content": "B" * 50001}], merge=True) + assert len(store.read()[0]["content"]) <= MAX_TODO_CONTENT_CHARS + + def test_item_count_is_bounded(self): + from tools.todo_tool import MAX_TODO_ITEMS + store = TodoStore() + store.write([ + {"id": str(i), "content": f"task {i}", "status": "pending"} + for i in range(5000) + ]) + assert len(store.read()) == MAX_TODO_ITEMS + + def test_normal_list_is_unchanged(self): + """No regression: ordinary plans pass through untouched (no marker, + same content, same order).""" + store = TodoStore() + store.write([ + {"id": "1", "content": "write the report", "status": "in_progress"}, + {"id": "2", "content": "review PR", "status": "pending"}, + ]) + items = store.read() + assert [i["content"] for i in items] == ["write the report", "review PR"] + assert "[truncated]" not in items[0]["content"] diff --git a/tools/todo_tool.py b/tools/todo_tool.py index 99d9ffe8515..960dab66603 100644 --- a/tools/todo_tool.py +++ b/tools/todo_tool.py @@ -21,6 +21,17 @@ from typing import Dict, Any, List, Optional # Valid status values for todo items VALID_STATUSES = {"pending", "in_progress", "completed", "cancelled"} +# Bounds on persisted todo state. The todo list is a planning aid the model +# re-reads after every context-compression event (see format_for_injection), +# so unbounded item content or count defeats the compression it rides through. +# These caps keep a single oversized item (whether authored by the model or +# replayed from caller-supplied history on the API server) from inflating the +# re-injection block. Generous relative to real plans — a todo item is a short +# task description, and active lists are a handful of items, not hundreds. +MAX_TODO_CONTENT_CHARS = 4000 +MAX_TODO_ITEMS = 256 +_TRUNCATION_MARKER = "… [truncated]" + class TodoStore: """ @@ -58,7 +69,7 @@ class TodoStore: if item_id in existing: # Update only the fields the LLM actually provided if "content" in t and t["content"]: - existing[item_id]["content"] = str(t["content"]).strip() + existing[item_id]["content"] = self._cap_content(str(t["content"]).strip()) if "status" in t and t["status"]: status = str(t["status"]).strip().lower() if status in VALID_STATUSES: @@ -77,6 +88,11 @@ class TodoStore: rebuilt.append(current) seen.add(current["id"]) self._items = rebuilt + # Bound total item count so a replayed/oversized list can't grow the + # re-injection block without limit. Keep the highest-priority head + # (list order is priority). + if len(self._items) > MAX_TODO_ITEMS: + self._items = self._items[:MAX_TODO_ITEMS] return self.read() def read(self) -> List[Dict[str, str]]: @@ -121,6 +137,19 @@ class TodoStore: return "\n".join(lines) + @staticmethod + def _cap_content(content: str) -> str: + """Truncate oversized todo content to MAX_TODO_CONTENT_CHARS. + + A single huge item would otherwise inflate the post-compression + re-injection block (format_for_injection) without bound. Keep the + head — the actionable part of a task description — plus a marker. + """ + if len(content) > MAX_TODO_CONTENT_CHARS: + keep = MAX_TODO_CONTENT_CHARS - len(_TRUNCATION_MARKER) + return content[:keep] + _TRUNCATION_MARKER + return content + @staticmethod def _validate(item: Dict[str, Any]) -> Dict[str, str]: """ @@ -136,6 +165,8 @@ class TodoStore: content = str(item.get("content", "")).strip() if not content: content = "(no description)" + else: + content = TodoStore._cap_content(content) status = str(item.get("status", "pending")).strip().lower() if status not in VALID_STATUSES: