mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-12 03:42:08 +00:00
fix(kanban): stop reclaimed workers before retry
This commit is contained in:
parent
63bd690a50
commit
06f24351c5
3 changed files with 141 additions and 38 deletions
|
|
@@ -3283,17 +3283,28 @@ def test_complete_prose_scan_ignores_existing_ids(kanban_home):
|
|||
# Recovery helpers (reclaim + reassign)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_reclaim_task_resets_running_to_ready(kanban_home):
|
||||
def test_reclaim_task_resets_running_to_ready(kanban_home, monkeypatch):
|
||||
"""Manual reclaim releases the claim, resets status, and emits a
|
||||
``reclaimed`` event even when claim_expires has not passed."""
|
||||
import signal
|
||||
import time
|
||||
import secrets
|
||||
import hermes_cli.kanban_db as _kb
|
||||
conn = kb.connect()
|
||||
try:
|
||||
t = kb.create_task(conn, title="stuck", assignee="broken")
|
||||
# Simulate a live claim (not expired).
|
||||
lock = secrets.token_hex(8)
|
||||
lock = f"{_kb._claimer_id().split(':', 1)[0]}:{secrets.token_hex(8)}"
|
||||
future = int(time.time()) + 3600
|
||||
killed: list[int] = []
|
||||
state = {"alive": True}
|
||||
|
||||
def _signal(pid, sig):
|
||||
killed.append(sig)
|
||||
if sig == signal.SIGTERM:
|
||||
state["alive"] = False
|
||||
|
||||
monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: state["alive"])
|
||||
conn.execute(
|
||||
"UPDATE tasks SET status='running', claim_lock=?, claim_expires=?, "
|
||||
"worker_pid=? WHERE id=?",
|
||||
|
|
@@ -3312,7 +3323,7 @@ def test_reclaim_task_resets_running_to_ready(kanban_home):
|
|||
assert kb.release_stale_claims(conn) == 0
|
||||
|
||||
# reclaim_task should work immediately.
|
||||
assert kb.reclaim_task(conn, t, reason="test reason") is True
|
||||
assert kb.reclaim_task(conn, t, reason="test reason", signal_fn=_signal) is True
|
||||
|
||||
row = conn.execute(
|
||||
"SELECT status, claim_lock, worker_pid FROM tasks WHERE id=?",
|
||||
|
|
@@ -3333,6 +3344,9 @@ def test_reclaim_task_resets_running_to_ready(kanban_home):
|
|||
assert len(reclaim_evs) == 1
|
||||
assert reclaim_evs[0].get("manual") is True
|
||||
assert reclaim_evs[0].get("reason") == "test reason"
|
||||
assert reclaim_evs[0].get("termination_attempted") is True
|
||||
assert reclaim_evs[0].get("terminated") is True
|
||||
assert killed == [signal.SIGTERM]
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
|
|
|||
|
|
@@ -168,18 +168,33 @@ def test_claim_fails_on_non_ready(kanban_home):
|
|||
assert kb.claim_task(conn, t) is None
|
||||
|
||||
|
||||
def test_stale_claim_reclaimed(kanban_home):
|
||||
def test_stale_claim_reclaimed(kanban_home, monkeypatch):
|
||||
import signal
|
||||
import hermes_cli.kanban_db as _kb
|
||||
|
||||
with kb.connect() as conn:
|
||||
t = kb.create_task(conn, title="x", assignee="a")
|
||||
kb.claim_task(conn, t)
|
||||
host = _kb._claimer_id().split(":", 1)[0]
|
||||
kb.claim_task(conn, t, claimer=f"{host}:worker")
|
||||
killed: list[int] = []
|
||||
state = {"alive": True}
|
||||
|
||||
def _signal(pid, sig):
|
||||
killed.append(sig)
|
||||
if sig == signal.SIGTERM:
|
||||
state["alive"] = False
|
||||
|
||||
kb._set_worker_pid(conn, t, 12345)
|
||||
# Rewind claim_expires so it looks stale.
|
||||
conn.execute(
|
||||
"UPDATE tasks SET claim_expires = ? WHERE id = ?",
|
||||
(int(time.time()) - 3600, t),
|
||||
)
|
||||
reclaimed = kb.release_stale_claims(conn)
|
||||
monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: state["alive"])
|
||||
reclaimed = kb.release_stale_claims(conn, signal_fn=_signal)
|
||||
assert reclaimed == 1
|
||||
assert kb.get_task(conn, t).status == "ready"
|
||||
assert killed == [signal.SIGTERM]
|
||||
|
||||
|
||||
def test_max_runtime_uses_current_run_start_after_retry(kanban_home):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue