From 40b51c93a2d9bce63d656ccb3751e624711e6e3c Mon Sep 17 00:00:00 2001 From: stephen0110 <51599529+stephen0110@users.noreply.github.com> Date: Thu, 7 May 2026 12:50:30 +0200 Subject: [PATCH] fix(kanban): heartbeat tool extends claim TTL, not just last_heartbeat_at MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kanban_heartbeat tool called heartbeat_worker but never heartbeat_claim, so a worker that loops the tool while a single tool call blocks the agent for >DEFAULT_CLAIM_TTL_SECONDS still got reclaimed by release_stale_claims. The function name and heartbeat_claim's own docstring imply otherwise: "Workers that know they'll exceed 15 minutes should call this every few minutes to keep ownership." But there was no caller in the worker tool path. Workers couldn't invoke heartbeat_claim themselves either — it isn't exposed as a tool. Fix: _handle_heartbeat now calls heartbeat_claim first, reading HERMES_KANBAN_CLAIM_LOCK from the worker env (the dispatcher pins this in _default_spawn). Falls back to _claimer_id() for locally- driven workers that didn't go through dispatcher spawn. Test: tests/tools/test_kanban_tools.py::test_heartbeat_extends_claim_expires rewinds claim_expires into the past, calls the tool, and asserts the new value is at least now + DEFAULT_CLAIM_TTL_SECONDS // 2. Verified to fail against the unfixed code (claim_expires stays at the rewound value). Closes the root cause underlying the symptom in #21141 (15-min respawns of long-running workers). #21141 separately addresses post-reclaim cleanup; this fixes the upstream "shouldn't have been reclaimed in the first place" half. --- tests/tools/test_kanban_tools.py | 55 ++++++++++++++++++++++++++++++++ tools/kanban_tools.py | 18 ++++++++++- 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/tests/tools/test_kanban_tools.py b/tests/tools/test_kanban_tools.py index f00a33d544..aa7168da6c 100644 --- a/tests/tools/test_kanban_tools.py +++ b/tests/tools/test_kanban_tools.py @@ -214,6 +214,61 @@ def test_heartbeat_without_note(worker_env): assert d["ok"] is True +def test_heartbeat_extends_claim_expires(worker_env): + """The kanban_heartbeat tool MUST extend claim_expires, not just + update last_heartbeat_at — otherwise long-running workers loop the + heartbeat tool diligently and still get reclaimed by + release_stale_claims at DEFAULT_CLAIM_TTL_SECONDS. + + Regression test for the bug where _handle_heartbeat called + heartbeat_worker but never heartbeat_claim, so claim_expires sat + static while last_heartbeat_at advanced. + """ + import time as _time + from hermes_cli import kanban_db as kb + from tools import kanban_tools as kt + + # Rewind claim_expires into the past so any forward movement is + # unambiguous (avoids time.sleep flakiness). + conn = kb.connect() + try: + conn.execute( + "UPDATE tasks SET claim_expires = ? WHERE id = ?", + (1, worker_env), + ) + conn.commit() + before = conn.execute( + "SELECT claim_expires FROM tasks WHERE id = ?", (worker_env,) + ).fetchone()["claim_expires"] + finally: + conn.close() + assert before == 1 + + out = kt._handle_heartbeat({"note": "still alive"}) + assert json.loads(out).get("ok") is True + + conn = kb.connect() + try: + after = conn.execute( + "SELECT claim_expires FROM tasks WHERE id = ?", (worker_env,) + ).fetchone()["claim_expires"] + finally: + conn.close() + + now = int(_time.time()) + # claim_expires should be roughly now + DEFAULT_CLAIM_TTL_SECONDS. + # We assert a generous floor (now + half the default TTL) to keep the + # test stable against future TTL changes. + assert after > before, ( + f"claim_expires did not advance ({before} -> {after}); workers " + f"would be reclaimed at TTL despite heartbeating" + ) + assert after >= now + (kb.DEFAULT_CLAIM_TTL_SECONDS // 2), ( + f"claim_expires={after} is suspiciously close to now={now}; " + f"expected at least now + {kb.DEFAULT_CLAIM_TTL_SECONDS // 2}" + ) + + def test_comment_happy_path(worker_env): from tools import kanban_tools as kt out = kt._handle_comment({ diff --git a/tools/kanban_tools.py b/tools/kanban_tools.py index 2f40b3f0de..2326895554 100644 --- a/tools/kanban_tools.py +++ b/tools/kanban_tools.py @@ -315,7 +315,15 @@ def _handle_block(args: dict, **kw) -> str: def _handle_heartbeat(args: dict, **kw) -> str: - """Signal that the worker is still alive during a long operation.""" + """Signal that the worker is still alive during a long operation. + + Extends the claim TTL via ``heartbeat_claim`` AND records a heartbeat + event via ``heartbeat_worker``. Without the ``heartbeat_claim`` half, + a diligent worker that loops this tool while a single tool call + blocks the agent for >DEFAULT_CLAIM_TTL_SECONDS still gets reclaimed + by ``release_stale_claims`` — which is exactly the trap that + ``heartbeat_claim``'s docstring warns against. + """ tid = _default_task_id(args.get("task_id")) if not tid: return tool_error( @@ -328,6 +336,14 @@ def _handle_heartbeat(args: dict, **kw) -> str: try: kb, conn = _connect() try: + # Extend the claim TTL first. The dispatcher pins + # HERMES_KANBAN_CLAIM_LOCK in the worker env at spawn time + # (see _default_spawn in kanban_db.py); falling back to the + # default _claimer_id() covers locally-driven workers that + # never went through the dispatcher path. + claim_lock = os.environ.get("HERMES_KANBAN_CLAIM_LOCK") + kb.heartbeat_claim(conn, tid, claimer=claim_lock) + ok = kb.heartbeat_worker( conn, tid,