From 69a293b419393c1c560ab9dab43b4d66e0e31230 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sun, 7 Jun 2026 17:52:45 -0700 Subject: [PATCH] hardening(todo): bound TodoStore item content length and count MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The todo list is re-injected into the model's context after every context-compression event (TodoStore.format_for_injection), so an oversized todo item or an unbounded number of items defeats the compression it is meant to ride through. TodoStore.write/_validate previously enforced no size or count bounds, so a single 50KB item produced a ~50KB re-injection block on every subsequent turn. Add two caps: - MAX_TODO_CONTENT_CHARS (4000): per-item content is truncated with a marker. Routed through a shared _cap_content() so the merge-update path (which writes content directly, bypassing _validate) is capped too. - MAX_TODO_ITEMS (256): total list length is bounded, keeping the highest-priority head (list order is priority). Both caps are generous relative to real plans — a todo item is a short task description and active lists are a handful of items. NOT a security fix. Raised externally via GHSA-5g4g-6jrg-mw3g, which framed a caller-supplied conversation_history on the authenticated API server replaying into _hydrate_todo_store as a DoS. That path is authenticated (the API server refuses to start without API_SERVER_KEY) and self-scoped (the caller supplies their own entire history and can only inflate their own response chain — forged role=tool entries are never persisted to the session DB), so it is out of scope as a vulnerability under SECURITY.md 3.2. These bounds are footgun containment that also applies to the trusted agent path, where the model itself authors the todos. Credit to the reporter for the observation. Co-authored-by: YLChen-007 <30854794+YLChen-007@users.noreply.github.com> --- tests/tools/test_todo_tool.py | 58 +++++++++++++++++++++++++++++++++++ tools/todo_tool.py | 33 +++++++++++++++++++- 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/tests/tools/test_todo_tool.py b/tests/tools/test_todo_tool.py index 6215078525c..dbb64e80ee6 100644 --- a/tests/tools/test_todo_tool.py +++ b/tests/tools/test_todo_tool.py @@ -117,3 +117,61 @@ class TestTodoToolFunction: def test_no_store_returns_error(self): result = json.loads(todo_tool()) assert "error" in result + + +class TestTodoStoreBounds: + """Bounds on persisted todo state (GHSA-5g4g-6jrg-mw3g hardening). + + The todo list is re-injected into context after every compression event, + so an unbounded item — whether authored by the model or replayed from + caller-supplied history on the API server's _hydrate_todo_store path — + would defeat the compression it rides through. These pin the caps. + Not a security boundary (the API surface is authenticated and the caller + supplies their own history); this is footgun containment / parity. + """ + + def test_oversized_content_is_truncated(self): + from tools.todo_tool import MAX_TODO_CONTENT_CHARS + store = TodoStore() + store.write([{"id": "1", "content": "A" * 50001, "status": "pending"}]) + item = store.read()[0] + assert len(item["content"]) <= MAX_TODO_CONTENT_CHARS + assert item["content"].endswith("… [truncated]") + + def test_injection_block_is_bounded(self): + from tools.todo_tool import MAX_TODO_CONTENT_CHARS + store = TodoStore() + store.write([{"id": "1", "content": "A" * 50001, "status": "pending"}]) + inj = store.format_for_injection() + # Before the fix this was ~50085 chars; now it tracks the cap. + assert len(inj) < MAX_TODO_CONTENT_CHARS + 200 + + def test_merge_update_content_is_capped(self): + """The merge path updates content directly, bypassing _validate — + verify it is capped too.""" + from tools.todo_tool import MAX_TODO_CONTENT_CHARS + store = TodoStore() + store.write([{"id": "1", "content": "short", "status": "pending"}]) + store.write([{"id": "1", "content": "B" * 50001}], merge=True) + assert len(store.read()[0]["content"]) <= MAX_TODO_CONTENT_CHARS + + def test_item_count_is_bounded(self): + from tools.todo_tool import MAX_TODO_ITEMS + store = TodoStore() + store.write([ + {"id": str(i), "content": f"task {i}", "status": "pending"} + for i in range(5000) + ]) + assert len(store.read()) == MAX_TODO_ITEMS + + def test_normal_list_is_unchanged(self): + """No regression: ordinary plans pass through untouched (no marker, + same content, same order).""" + store = TodoStore() + store.write([ + {"id": "1", "content": "write the report", "status": "in_progress"}, + {"id": "2", "content": "review PR", "status": "pending"}, + ]) + items = store.read() + assert [i["content"] for i in items] == ["write the report", "review PR"] + assert "[truncated]" not in items[0]["content"] diff --git a/tools/todo_tool.py b/tools/todo_tool.py index 99d9ffe8515..960dab66603 100644 --- a/tools/todo_tool.py +++ b/tools/todo_tool.py @@ -21,6 +21,17 @@ from typing import Dict, Any, List, Optional # Valid status values for todo items VALID_STATUSES = {"pending", "in_progress", "completed", "cancelled"} +# Bounds on persisted todo state. The todo list is a planning aid the model +# re-reads after every context-compression event (see format_for_injection), +# so unbounded item content or count defeats the compression it rides through. +# These caps keep a single oversized item (whether authored by the model or +# replayed from caller-supplied history on the API server) from inflating the +# re-injection block. Generous relative to real plans — a todo item is a short +# task description, and active lists are a handful of items, not hundreds. +MAX_TODO_CONTENT_CHARS = 4000 +MAX_TODO_ITEMS = 256 +_TRUNCATION_MARKER = "… [truncated]" + class TodoStore: """ @@ -58,7 +69,7 @@ class TodoStore: if item_id in existing: # Update only the fields the LLM actually provided if "content" in t and t["content"]: - existing[item_id]["content"] = str(t["content"]).strip() + existing[item_id]["content"] = self._cap_content(str(t["content"]).strip()) if "status" in t and t["status"]: status = str(t["status"]).strip().lower() if status in VALID_STATUSES: @@ -77,6 +88,11 @@ class TodoStore: rebuilt.append(current) seen.add(current["id"]) self._items = rebuilt + # Bound total item count so a replayed/oversized list can't grow the + # re-injection block without limit. Keep the highest-priority head + # (list order is priority). + if len(self._items) > MAX_TODO_ITEMS: + self._items = self._items[:MAX_TODO_ITEMS] return self.read() def read(self) -> List[Dict[str, str]]: @@ -121,6 +137,19 @@ class TodoStore: return "\n".join(lines) + @staticmethod + def _cap_content(content: str) -> str: + """Truncate oversized todo content to MAX_TODO_CONTENT_CHARS. + + A single huge item would otherwise inflate the post-compression + re-injection block (format_for_injection) without bound. Keep the + head — the actionable part of a task description — plus a marker. + """ + if len(content) > MAX_TODO_CONTENT_CHARS: + keep = MAX_TODO_CONTENT_CHARS - len(_TRUNCATION_MARKER) + return content[:keep] + _TRUNCATION_MARKER + return content + @staticmethod def _validate(item: Dict[str, Any]) -> Dict[str, str]: """ @@ -136,6 +165,8 @@ class TodoStore: content = str(item.get("content", "")).strip() if not content: content = "(no description)" + else: + content = TodoStore._cap_content(content) status = str(item.get("status", "pending")).strip().lower() if status not in VALID_STATUSES: