mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(tools): clarify kanban_complete phantom-card retry guidance
When kanban_complete rejects a created_cards list as hallucinated, the
task is intentionally left in-flight (the gate runs before the write
txn) so the worker can retry with a corrected list or pass
created_cards=[] to skip the check. The retry path already worked, but
the previous error wording read like a terminal failure and workers
were observed abandoning the run instead of trying again.
Spell out the recovery path explicitly in the tool_error response
("Your task is still in-flight ... Retry kanban_complete with ...") and
add regression coverage at both the kernel and tool layers so the
retry contract — and the wording the worker depends on to discover
it — is pinned.
Fixes #22923
This commit is contained in:
parent
2f00559d9e
commit
62cfe79e93
3 changed files with 181 additions and 2 deletions
|
|
@ -3539,6 +3539,76 @@ def test_complete_accepts_cross_worker_card_when_linked_as_child(kanban_home):
|
|||
conn.close()
|
||||
|
||||
|
||||
def test_complete_can_retry_after_phantom_rejection(kanban_home):
    """Pin the kernel-level retry contract of the hallucinated-card gate.

    A ``complete_task`` call rejected with ``HallucinatedCardsError`` must
    leave the task ``running`` so a second call on the same task can still
    complete it. Two recovery shapes are exercised: retrying with an empty
    ``created_cards`` list (the escape hatch) and retrying with the phantom
    id dropped from the list. Regression test for #22923.
    """
    conn = kb.connect()
    try:
        # Each recovery shape gets its own claimed task so the status
        # assertions of one scenario cannot interfere with the other.
        task_for_empty_retry = kb.create_task(
            conn, title="retry-empty", assignee="alice",
        )
        kb.claim_task(conn, task_for_empty_retry)
        task_for_fixed_retry = kb.create_task(
            conn, title="retry-corrected", assignee="alice",
        )
        kb.claim_task(conn, task_for_fixed_retry)
        real_child = kb.create_task(
            conn, title="real-child", assignee="x", created_by="alice",
        )

        # (task, rejected created_cards, retry summary, retry created_cards)
        scenarios = (
            (
                task_for_empty_retry,
                ["t_phantomdeadbeef"],
                "retry without claims",
                [],
            ),
            (
                task_for_fixed_retry,
                [real_child, "t_anotherphantom"],
                "retry with corrected list",
                [real_child],
            ),
        )
        for task_id, bad_cards, retry_summary, retry_cards in scenarios:
            # First attempt: a phantom id in the list is rejected and the
            # task must remain running.
            with pytest.raises(kb.HallucinatedCardsError):
                kb.complete_task(
                    conn, task_id,
                    summary="oops",
                    created_cards=bad_cards,
                )
            assert kb.get_task(conn, task_id).status == "running"

            # Second attempt on the same task: the completion lands.
            completed = kb.complete_task(
                conn, task_id,
                summary=retry_summary,
                created_cards=retry_cards,
            )
            assert completed is True
            assert kb.get_task(conn, task_id).status == "done"

        # Each task must carry exactly one blocked-completion audit event
        # and exactly one completion event.
        for task_id in (task_for_empty_retry, task_for_fixed_retry):
            cursor = conn.execute(
                "SELECT kind FROM task_events WHERE task_id=? ORDER BY id",
                (task_id,),
            )
            event_kinds = [row["kind"] for row in cursor]
            assert event_kinds.count("completion_blocked_hallucination") == 1
            assert event_kinds.count("completed") == 1
    finally:
        conn.close()
|
||||
|
||||
|
||||
def test_complete_prose_scan_flags_nonexistent_ids(kanban_home):
|
||||
"""Successful completion whose summary references a ``t_<hex>`` id
|
||||
that doesn't resolve emits a ``suspected_hallucinated_references``
|
||||
|
|
|
|||
|
|
@ -330,6 +330,106 @@ def test_complete_rejects_non_dict_metadata(worker_env):
|
|||
assert json.loads(out).get("error")
|
||||
|
||||
|
||||
def test_complete_phantom_card_message_advertises_retry(worker_env):
    """Pin the wording of the phantom-card tool error.

    The error returned by ``_handle_complete`` for a hallucinated card id
    must name the offending id, say the task is still in-flight, and spell
    out how to retry. It must also leave the task ``running``. Regression
    for #22923, where the earlier wording read like a terminal failure.
    """
    from hermes_cli import kanban_db as kb
    from tools import kanban_tools as kt

    request = {
        "summary": "oops claimed a phantom",
        "created_cards": ["t_phantomdeadbeef"],
    }
    out = kt._handle_complete(request)
    err = json.loads(out).get("error", "")
    assert err, f"expected an error, got {out!r}"

    # The phantom id must be echoed back verbatim.
    assert "t_phantomdeadbeef" in err

    # Literal cues that tell the worker a retry is supported. Rewording
    # the message without keeping these phrases trips the checks below
    # (see #22923 for the failure mode).
    for cue in ("still in-flight", "Retry kanban_complete", "created_cards=[]"):
        assert cue in err

    # The rejection must not have changed the task's state, otherwise the
    # advertised retry could not land.
    conn = kb.connect()
    try:
        task = kb.get_task(conn, worker_env)
        assert task.status == "running"
    finally:
        conn.close()
|
||||
|
||||
|
||||
def test_complete_retry_with_empty_created_cards_succeeds(worker_env):
    """Retrying with ``created_cards=[]`` completes a rejected task.

    The first ``_handle_complete`` call claims a phantom card and must
    return an error; the second call passes an empty list (the escape
    hatch) and must report ``ok`` and leave the task ``done``.
    Regression for #22923.
    """
    from hermes_cli import kanban_db as kb
    from tools import kanban_tools as kt

    # Trip the gate with a phantom id.
    first_reply = kt._handle_complete({
        "summary": "oops",
        "created_cards": ["t_phantomdeadbeef"],
    })
    rejected = json.loads(first_reply)
    assert rejected.get("error")

    # Second attempt uses the empty-list escape hatch.
    second_reply = kt._handle_complete({
        "summary": "retry without claims",
        "created_cards": [],
    })
    accepted = json.loads(second_reply)
    assert accepted.get("ok") is True

    conn = kb.connect()
    try:
        task = kb.get_task(conn, worker_env)
        assert task.status == "done"
    finally:
        conn.close()
|
||||
|
||||
|
||||
def test_complete_retry_with_corrected_created_cards_succeeds(worker_env):
    """Retrying with the phantom id removed completes a rejected task.

    A real child card is created through ``_handle_create``. The first
    ``_handle_complete`` call lists it together with a phantom id and
    must return an error naming the phantom; the second call lists only
    the real card and must report ``ok`` and leave the task ``done``.
    Regression for #22923.
    """
    from hermes_cli import kanban_db as kb
    from tools import kanban_tools as kt

    # Go through the tool layer to create the child so it carries the
    # worker-profile attribution the gate trusts.
    create_reply = kt._handle_create({
        "title": "real child", "assignee": "peer",
    })
    child = json.loads(create_reply)
    assert child["ok"]
    real_id = child["task_id"]

    # Real id plus a phantom: the gate must reject and name the phantom.
    first_reply = kt._handle_complete({
        "summary": "oops",
        "created_cards": [real_id, "t_phantomdeadbeef"],
    })
    rejected = json.loads(first_reply)
    assert rejected.get("error")
    assert "t_phantomdeadbeef" in rejected["error"]

    # Same task, phantom removed: the completion must land.
    second_reply = kt._handle_complete({
        "summary": "retry with corrected list",
        "created_cards": [real_id],
    })
    accepted = json.loads(second_reply)
    assert accepted.get("ok") is True

    conn = kb.connect()
    try:
        task = kb.get_task(conn, worker_env)
        assert task.status == "done"
    finally:
        conn.close()
|
||||
|
||||
|
||||
def test_block_happy_path(worker_env):
|
||||
from tools import kanban_tools as kt
|
||||
out = kt._handle_block({"reason": "need clarification"})
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue