hermes-agent/tests/tools/test_todo_tool.py
Teknium 69a293b419 hardening(todo): bound TodoStore item content length and count
The todo list is re-injected into the model's context after every
context-compression event (TodoStore.format_for_injection), so an oversized
todo item or an unbounded number of items defeats the compression it is meant
to ride through. TodoStore.write/_validate previously enforced no size or count
bounds, so a single 50KB item produced a ~50KB re-injection block on every
subsequent turn.

Add two caps:
- MAX_TODO_CONTENT_CHARS (4000): per-item content is truncated with a marker.
  Routed through a shared _cap_content() so the merge-update path (which writes
  content directly, bypassing _validate) is capped too.
- MAX_TODO_ITEMS (256): total list length is bounded, keeping the
  highest-priority head (list order is priority).

Both caps are generous relative to real plans — a todo item is a short task
description and active lists are a handful of items.

NOT a security fix. Raised externally via GHSA-5g4g-6jrg-mw3g, which framed a
caller-supplied conversation_history on the authenticated API server replaying
into _hydrate_todo_store as a DoS. That path is authenticated (the API server
refuses to start without API_SERVER_KEY) and self-scoped (the caller supplies
their own entire history and can only inflate their own response chain — forged
role=tool entries are never persisted to the session DB), so it is out of scope
as a vulnerability under SECURITY.md 3.2. These bounds are footgun containment
that also applies to the trusted agent path, where the model itself authors the
todos. Credit to the reporter for the observation.

Co-authored-by: YLChen-007 <30854794+YLChen-007@users.noreply.github.com>
2026-06-07 18:06:27 -07:00

177 lines
6.4 KiB
Python

"""Tests for the todo tool module."""
import json
from tools.todo_tool import TodoStore, todo_tool
class TestWriteAndRead:
def test_write_replaces_list(self):
store = TodoStore()
items = [
{"id": "1", "content": "First task", "status": "pending"},
{"id": "2", "content": "Second task", "status": "in_progress"},
]
result = store.write(items)
assert len(result) == 2
assert result[0]["id"] == "1"
assert result[1]["status"] == "in_progress"
def test_read_returns_copy(self):
store = TodoStore()
store.write([{"id": "1", "content": "Task", "status": "pending"}])
items = store.read()
items[0]["content"] = "MUTATED"
assert store.read()[0]["content"] == "Task"
def test_write_deduplicates_duplicate_ids(self):
store = TodoStore()
result = store.write([
{"id": "1", "content": "First version", "status": "pending"},
{"id": "2", "content": "Other task", "status": "pending"},
{"id": "1", "content": "Latest version", "status": "in_progress"},
])
assert result == [
{"id": "2", "content": "Other task", "status": "pending"},
{"id": "1", "content": "Latest version", "status": "in_progress"},
]
class TestHasItems:
def test_empty_store(self):
store = TodoStore()
assert store.has_items() is False
def test_non_empty_store(self):
store = TodoStore()
store.write([{"id": "1", "content": "x", "status": "pending"}])
assert store.has_items() is True
class TestFormatForInjection:
def test_empty_returns_none(self):
store = TodoStore()
assert store.format_for_injection() is None
def test_non_empty_has_markers(self):
store = TodoStore()
store.write([
{"id": "1", "content": "Do thing", "status": "completed"},
{"id": "2", "content": "Next", "status": "pending"},
{"id": "3", "content": "Working", "status": "in_progress"},
])
text = store.format_for_injection()
# Completed items are filtered out of injection
assert "[x]" not in text
assert "Do thing" not in text
# Active items are included
assert "[ ]" in text
assert "[>]" in text
assert "Next" in text
assert "Working" in text
assert "context compression" in text.lower()
class TestMergeMode:
def test_update_existing_by_id(self):
store = TodoStore()
store.write([
{"id": "1", "content": "Original", "status": "pending"},
])
store.write(
[{"id": "1", "status": "completed"}],
merge=True,
)
items = store.read()
assert len(items) == 1
assert items[0]["status"] == "completed"
assert items[0]["content"] == "Original"
def test_merge_appends_new(self):
store = TodoStore()
store.write([{"id": "1", "content": "First", "status": "pending"}])
store.write(
[{"id": "2", "content": "Second", "status": "pending"}],
merge=True,
)
items = store.read()
assert len(items) == 2
class TestTodoToolFunction:
def test_read_mode(self):
store = TodoStore()
store.write([{"id": "1", "content": "Task", "status": "pending"}])
result = json.loads(todo_tool(store=store))
assert result["summary"]["total"] == 1
assert result["summary"]["pending"] == 1
def test_write_mode(self):
store = TodoStore()
result = json.loads(todo_tool(
todos=[{"id": "1", "content": "New", "status": "in_progress"}],
store=store,
))
assert result["summary"]["in_progress"] == 1
def test_no_store_returns_error(self):
result = json.loads(todo_tool())
assert "error" in result
class TestTodoStoreBounds:
"""Bounds on persisted todo state (GHSA-5g4g-6jrg-mw3g hardening).
The todo list is re-injected into context after every compression event,
so an unbounded item — whether authored by the model or replayed from
caller-supplied history on the API server's _hydrate_todo_store path —
would defeat the compression it rides through. These pin the caps.
Not a security boundary (the API surface is authenticated and the caller
supplies their own history); this is footgun containment / parity.
"""
def test_oversized_content_is_truncated(self):
from tools.todo_tool import MAX_TODO_CONTENT_CHARS
store = TodoStore()
store.write([{"id": "1", "content": "A" * 50001, "status": "pending"}])
item = store.read()[0]
assert len(item["content"]) <= MAX_TODO_CONTENT_CHARS
assert item["content"].endswith("… [truncated]")
def test_injection_block_is_bounded(self):
from tools.todo_tool import MAX_TODO_CONTENT_CHARS
store = TodoStore()
store.write([{"id": "1", "content": "A" * 50001, "status": "pending"}])
inj = store.format_for_injection()
# Before the fix this was ~50085 chars; now it tracks the cap.
assert len(inj) < MAX_TODO_CONTENT_CHARS + 200
def test_merge_update_content_is_capped(self):
"""The merge path updates content directly, bypassing _validate —
verify it is capped too."""
from tools.todo_tool import MAX_TODO_CONTENT_CHARS
store = TodoStore()
store.write([{"id": "1", "content": "short", "status": "pending"}])
store.write([{"id": "1", "content": "B" * 50001}], merge=True)
assert len(store.read()[0]["content"]) <= MAX_TODO_CONTENT_CHARS
def test_item_count_is_bounded(self):
from tools.todo_tool import MAX_TODO_ITEMS
store = TodoStore()
store.write([
{"id": str(i), "content": f"task {i}", "status": "pending"}
for i in range(5000)
])
assert len(store.read()) == MAX_TODO_ITEMS
def test_normal_list_is_unchanged(self):
"""No regression: ordinary plans pass through untouched (no marker,
same content, same order)."""
store = TodoStore()
store.write([
{"id": "1", "content": "write the report", "status": "in_progress"},
{"id": "2", "content": "review PR", "status": "pending"},
])
items = store.read()
assert [i["content"] for i in items] == ["write the report", "review PR"]
assert "[truncated]" not in items[0]["content"]