guard kanban worker lifecycle by run id

This commit is contained in:
misery-hl 2026-05-04 09:39:47 -07:00 committed by Teknium
parent f0d278412f
commit 56b4795115
5 changed files with 243 additions and 36 deletions

View file

@ -1186,6 +1186,79 @@ def test_multiple_attempts_preserved_as_runs(kanban_home):
conn.close()
def test_stale_run_cannot_complete_new_attempt(kanban_home, monkeypatch):
    """Completion is accepted only when it names the task's current run.

    Scenario exercised here:

    1. A task is created and claimed (first run).
    2. ``_pid_alive`` is patched to report every pid as dead, so
       ``detect_crashed_workers`` returns the task.
    3. The task is claimed again (second run, with a different id).
    4. ``complete_task`` with the first run's id must return a falsy value
       and leave the task ``running`` on the second run.
    5. ``complete_task`` with the second run's id must return a truthy
       value; the run history then reads ``crashed`` followed by
       ``completed``.
    """
    import hermes_cli.kanban_db as _kb

    conn = kb.connect()
    try:
        task_id = kb.create_task(conn, title="retry guarded", assignee="worker")

        # First attempt: claim it, then make its worker look dead.
        kb.claim_task(conn, task_id)
        stale_run = kb.latest_run(conn, task_id)
        kb._set_worker_pid(conn, task_id, 98765)
        monkeypatch.setattr(_kb, "_pid_alive", lambda pid: False)
        crashed_ids = kb.detect_crashed_workers(conn)
        assert crashed_ids == [task_id]

        # Second attempt: a fresh claim must open a distinct run.
        kb.claim_task(conn, task_id)
        active_run = kb.latest_run(conn, task_id)
        assert active_run.id != stale_run.id

        # The late completion from the first attempt is refused ...
        stale_accepted = kb.complete_task(
            conn,
            task_id,
            summary="late stale completion",
            expected_run_id=stale_run.id,
        )
        assert not stale_accepted

        # ... and the task is still running on the second attempt.
        current = kb.get_task(conn, task_id)
        assert current.status == "running"
        assert current.current_run_id == active_run.id

        # The completion from the active attempt goes through.
        active_accepted = kb.complete_task(
            conn,
            task_id,
            summary="current completion",
            expected_run_id=active_run.id,
        )
        assert active_accepted

        history = kb.list_runs(conn, task_id)
        outcomes = [run.outcome for run in history]
        assert outcomes == ["crashed", "completed"]
        assert history[-1].summary == "current completion"
    finally:
        conn.close()
def test_stale_run_cannot_block_or_heartbeat_new_attempt(kanban_home, monkeypatch):
    """Heartbeat and block calls are accepted only for the current run.

    The task is claimed, its worker is reported dead (``_pid_alive`` is
    patched to return ``False``) so the attempt is detected as crashed, and
    the task is claimed a second time.  ``heartbeat_worker`` and
    ``block_task`` carrying the first run's id must both return a falsy
    value and leave the task ``running`` on the second run with no
    heartbeat recorded.  The same calls carrying the second run's id must
    both return a truthy value and leave the task ``blocked``.
    """
    import hermes_cli.kanban_db as _kb

    conn = kb.connect()
    try:
        task_id = kb.create_task(
            conn, title="retry heartbeat guarded", assignee="worker"
        )

        # First attempt: claim it, then make its worker look dead.
        kb.claim_task(conn, task_id)
        stale_run = kb.latest_run(conn, task_id)
        kb._set_worker_pid(conn, task_id, 98765)
        monkeypatch.setattr(_kb, "_pid_alive", lambda pid: False)
        crashed_ids = kb.detect_crashed_workers(conn)
        assert crashed_ids == [task_id]

        # Second attempt: a fresh claim must open a distinct run.
        kb.claim_task(conn, task_id)
        active_run = kb.latest_run(conn, task_id)
        assert active_run.id != stale_run.id

        # Lifecycle calls from the first attempt are refused.
        stale_beat = kb.heartbeat_worker(
            conn, task_id, note="late", expected_run_id=stale_run.id
        )
        assert not stale_beat
        stale_block = kb.block_task(
            conn, task_id, reason="late block", expected_run_id=stale_run.id
        )
        assert not stale_block

        # Nothing about the active attempt changed as a result.
        current = kb.get_task(conn, task_id)
        assert current.status == "running"
        assert current.current_run_id == active_run.id
        assert current.last_heartbeat_at is None

        # The same calls from the active attempt are honoured.
        active_beat = kb.heartbeat_worker(
            conn, task_id, note="current", expected_run_id=active_run.id
        )
        assert active_beat
        active_block = kb.block_task(
            conn, task_id, reason="current block", expected_run_id=active_run.id
        )
        assert active_block

        after_block = kb.get_task(conn, task_id)
        assert after_block.status == "blocked"
    finally:
        conn.close()
def test_run_on_block_with_reason(kanban_home):
conn = kb.connect()
try:

View file

@ -611,6 +611,44 @@ def test_worker_complete_own_task_still_works(worker_env):
assert d.get("ok") is True and d.get("task_id") == worker_env
def test_worker_complete_rejects_stale_run_id(worker_env, monkeypatch):
    """The complete handler rejects a run id left over from an old attempt.

    The task named by ``worker_env`` has its worker reported dead
    (``_pid_alive`` is patched to return ``False``) so it is detected as
    crashed, and is then claimed again.  With ``HERMES_KANBAN_RUN_ID`` set
    to the first run's id, ``_handle_complete`` must not report ``ok`` and
    the task must stay ``running`` on the second run.  With the variable
    set to the second run's id, ``_handle_complete`` must report ``ok``.
    """
    # A single alias is enough: the module patched below is the same
    # object whose functions are called.
    from hermes_cli import kanban_db as kb

    conn = kb.connect()
    try:
        stale_run = kb.latest_run(conn, worker_env)
        kb._set_worker_pid(conn, worker_env, 98765)
        monkeypatch.setattr(kb, "_pid_alive", lambda pid: False)
        crashed_ids = kb.detect_crashed_workers(conn)
        assert crashed_ids == [worker_env]

        kb.claim_task(conn, worker_env)
        active_run = kb.latest_run(conn, worker_env)
        assert active_run.id != stale_run.id
    finally:
        conn.close()

    from tools import kanban_tools as kt

    def complete_as(run, summary):
        """Call the complete handler while the env names *run*; return its parsed JSON."""
        monkeypatch.setenv("HERMES_KANBAN_RUN_ID", str(run.id))
        raw = kt._handle_complete({"summary": summary})
        return json.loads(raw)

    # Completion carrying the old run id is not acknowledged ...
    stale_reply = complete_as(stale_run, "late stale completion")
    assert stale_reply.get("ok") is not True

    # ... and the task is still running on the new attempt.
    conn = kb.connect()
    try:
        current = kb.get_task(conn, worker_env)
        assert current.status == "running"
        assert current.current_run_id == active_run.id
    finally:
        conn.close()

    # Completion carrying the current run id is acknowledged.
    active_reply = complete_as(active_run, "current completion")
    assert active_reply.get("ok") is True
def test_orchestrator_complete_any_task_allowed(monkeypatch, tmp_path):
"""Orchestrator profiles (no HERMES_KANBAN_TASK) can still complete
any task via explicit task_id. The check only applies to workers."""