fix(kanban): heartbeat tool extends claim TTL, not just last_heartbeat_at

The kanban_heartbeat tool called heartbeat_worker but never
heartbeat_claim, so a worker that loops the tool while a single tool
call blocks the agent for >DEFAULT_CLAIM_TTL_SECONDS still got
reclaimed by release_stale_claims. The function name and
heartbeat_claim's own docstring imply otherwise:

  "Workers that know they'll exceed 15 minutes should call this
   every few minutes to keep ownership."

But there was no caller in the worker tool path. Workers couldn't
invoke heartbeat_claim themselves either — it isn't exposed as a tool.

Fix: _handle_heartbeat now calls heartbeat_claim first, reading
HERMES_KANBAN_CLAIM_LOCK from the worker env (the dispatcher pins
this in _default_spawn). Falls back to _claimer_id() for locally-
driven workers that didn't go through dispatcher spawn.

Test: tests/tools/test_kanban_tools.py::test_heartbeat_extends_claim_expires
rewinds claim_expires into the past, calls the tool, and asserts the
new value is at least now + DEFAULT_CLAIM_TTL_SECONDS // 2. Verified to
fail against the unfixed code (claim_expires stays at the rewound
value).

Closes the root cause underlying the symptom in #21141 (15-min
respawns of long-running workers). #21141 separately addresses
post-reclaim cleanup; this fixes the upstream "shouldn't have been
reclaimed in the first place" half.
This commit is contained in:
stephen0110 2026-05-07 12:50:30 +02:00 committed by Teknium
parent bf843adf05
commit 40b51c93a2
2 changed files with 72 additions and 1 deletions

View file

@ -214,6 +214,61 @@ def test_heartbeat_without_note(worker_env):
assert d["ok"] is True assert d["ok"] is True
def test_heartbeat_extends_claim_expires(worker_env):
"""The kanban_heartbeat tool MUST extend claim_expires, not just
update last_heartbeat_at otherwise long-running workers loop the
heartbeat tool diligently and still get reclaimed by
release_stale_claims at DEFAULT_CLAIM_TTL_SECONDS.
Regression test for the bug where _handle_heartbeat called
heartbeat_worker but never heartbeat_claim, so claim_expires sat
static while last_heartbeat_at advanced.
"""
import time as _time
from hermes_cli import kanban_db as kb
from tools import kanban_tools as kt
# Rewind claim_expires into the past so any forward movement is
# unambiguous (avoids time.sleep flakiness).
conn = kb.connect()
try:
conn.execute(
"UPDATE tasks SET claim_expires = ? WHERE id = ?",
(1, worker_env),
)
conn.commit()
before = conn.execute(
"SELECT claim_expires FROM tasks WHERE id = ?", (worker_env,)
).fetchone()["claim_expires"]
finally:
conn.close()
assert before == 1
out = kt._handle_heartbeat({"note": "still alive"})
assert json.loads(out).get("ok") is True
conn = kb.connect()
try:
after = conn.execute(
"SELECT claim_expires FROM tasks WHERE id = ?", (worker_env,)
).fetchone()["claim_expires"]
finally:
conn.close()
now = int(_time.time())
# claim_expires should be roughly now + DEFAULT_CLAIM_TTL_SECONDS.
# We assert a generous floor (now + half the default TTL) to keep the
# test stable against future TTL changes.
assert after > before, (
f"claim_expires did not advance ({before} -> {after}); workers "
f"would be reclaimed at TTL despite heartbeating"
)
assert after >= now + (kb.DEFAULT_CLAIM_TTL_SECONDS // 2), (
f"claim_expires={after} is suspiciously close to now={now}; "
f"expected at least now + {kb.DEFAULT_CLAIM_TTL_SECONDS // 2}"
)
def test_comment_happy_path(worker_env): def test_comment_happy_path(worker_env):
from tools import kanban_tools as kt from tools import kanban_tools as kt
out = kt._handle_comment({ out = kt._handle_comment({

View file

@ -315,7 +315,15 @@ def _handle_block(args: dict, **kw) -> str:
def _handle_heartbeat(args: dict, **kw) -> str: def _handle_heartbeat(args: dict, **kw) -> str:
"""Signal that the worker is still alive during a long operation.""" """Signal that the worker is still alive during a long operation.
Extends the claim TTL via ``heartbeat_claim`` AND records a heartbeat
event via ``heartbeat_worker``. Without the ``heartbeat_claim`` half,
a diligent worker that loops this tool while a single tool call
blocks the agent for >DEFAULT_CLAIM_TTL_SECONDS still gets reclaimed
by ``release_stale_claims`` which is exactly the trap that
``heartbeat_claim``'s docstring warns against.
"""
tid = _default_task_id(args.get("task_id")) tid = _default_task_id(args.get("task_id"))
if not tid: if not tid:
return tool_error( return tool_error(
@ -328,6 +336,14 @@ def _handle_heartbeat(args: dict, **kw) -> str:
try: try:
kb, conn = _connect() kb, conn = _connect()
try: try:
# Extend the claim TTL first. The dispatcher pins
# HERMES_KANBAN_CLAIM_LOCK in the worker env at spawn time
# (see _default_spawn in kanban_db.py); falling back to the
# default _claimer_id() covers locally-driven workers that
# never went through the dispatcher path.
claim_lock = os.environ.get("HERMES_KANBAN_CLAIM_LOCK")
kb.heartbeat_claim(conn, tid, claimer=claim_lock)
ok = kb.heartbeat_worker( ok = kb.heartbeat_worker(
conn, conn,
tid, tid,