mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-12 03:42:08 +00:00
fix(kanban): heartbeat tool extends claim TTL, not just last_heartbeat_at
The kanban_heartbeat tool called heartbeat_worker but never heartbeat_claim, so a worker that loops the tool while a single tool call blocks the agent for >DEFAULT_CLAIM_TTL_SECONDS still got reclaimed by release_stale_claims. The function name and heartbeat_claim's own docstring imply otherwise: "Workers that know they'll exceed 15 minutes should call this every few minutes to keep ownership." But there was no caller in the worker tool path. Workers couldn't invoke heartbeat_claim themselves either — it isn't exposed as a tool. Fix: _handle_heartbeat now calls heartbeat_claim first, reading HERMES_KANBAN_CLAIM_LOCK from the worker env (the dispatcher pins this in _default_spawn). Falls back to _claimer_id() for locally- driven workers that didn't go through dispatcher spawn. Test: tests/tools/test_kanban_tools.py::test_heartbeat_extends_claim_expires rewinds claim_expires into the past, calls the tool, and asserts the new value is at least now + DEFAULT_CLAIM_TTL_SECONDS // 2. Verified to fail against the unfixed code (claim_expires stays at the rewound value). Closes the root cause underlying the symptom in #21141 (15-min respawns of long-running workers). #21141 separately addresses post-reclaim cleanup; this fixes the upstream "shouldn't have been reclaimed in the first place" half.
This commit is contained in:
parent
bf843adf05
commit
40b51c93a2
2 changed files with 72 additions and 1 deletions
|
|
@ -214,6 +214,61 @@ def test_heartbeat_without_note(worker_env):
|
|||
assert d["ok"] is True
|
||||
|
||||
|
||||
def test_heartbeat_extends_claim_expires(worker_env):
|
||||
"""The kanban_heartbeat tool MUST extend claim_expires, not just
|
||||
update last_heartbeat_at — otherwise long-running workers loop the
|
||||
heartbeat tool diligently and still get reclaimed by
|
||||
release_stale_claims at DEFAULT_CLAIM_TTL_SECONDS.
|
||||
|
||||
Regression test for the bug where _handle_heartbeat called
|
||||
heartbeat_worker but never heartbeat_claim, so claim_expires sat
|
||||
static while last_heartbeat_at advanced.
|
||||
"""
|
||||
import time as _time
|
||||
from hermes_cli import kanban_db as kb
|
||||
from tools import kanban_tools as kt
|
||||
|
||||
# Rewind claim_expires into the past so any forward movement is
|
||||
# unambiguous (avoids time.sleep flakiness).
|
||||
conn = kb.connect()
|
||||
try:
|
||||
conn.execute(
|
||||
"UPDATE tasks SET claim_expires = ? WHERE id = ?",
|
||||
(1, worker_env),
|
||||
)
|
||||
conn.commit()
|
||||
before = conn.execute(
|
||||
"SELECT claim_expires FROM tasks WHERE id = ?", (worker_env,)
|
||||
).fetchone()["claim_expires"]
|
||||
finally:
|
||||
conn.close()
|
||||
assert before == 1
|
||||
|
||||
out = kt._handle_heartbeat({"note": "still alive"})
|
||||
assert json.loads(out).get("ok") is True
|
||||
|
||||
conn = kb.connect()
|
||||
try:
|
||||
after = conn.execute(
|
||||
"SELECT claim_expires FROM tasks WHERE id = ?", (worker_env,)
|
||||
).fetchone()["claim_expires"]
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
now = int(_time.time())
|
||||
# claim_expires should be roughly now + DEFAULT_CLAIM_TTL_SECONDS.
|
||||
# We assert a generous floor (now + half the default TTL) to keep the
|
||||
# test stable against future TTL changes.
|
||||
assert after > before, (
|
||||
f"claim_expires did not advance ({before} -> {after}); workers "
|
||||
f"would be reclaimed at TTL despite heartbeating"
|
||||
)
|
||||
assert after >= now + (kb.DEFAULT_CLAIM_TTL_SECONDS // 2), (
|
||||
f"claim_expires={after} is suspiciously close to now={now}; "
|
||||
f"expected at least now + {kb.DEFAULT_CLAIM_TTL_SECONDS // 2}"
|
||||
)
|
||||
|
||||
|
||||
def test_comment_happy_path(worker_env):
|
||||
from tools import kanban_tools as kt
|
||||
out = kt._handle_comment({
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue