fix(kanban): extend stale claim instead of killing live worker

Workers running slow models (e.g. kimi-k2.6) can spend longer than DEFAULT_CLAIM_TTL_SECONDS inside a single tool-free LLM call, making no tool calls and therefore not heartbeating. release_stale_claims previously reclaimed these healthy workers, producing the spawn-then-immediately-reclaim loop reported in #23025. When a stale-by-TTL claim's host-local worker PID is still alive, extend the claim (emit a claim_extended event) rather than killing it. enforce_max_runtime / detect_crashed_workers remain the upper bounds for genuinely wedged or dead workers. Reclaim events now also record claim_expires, last_heartbeat_at, worker_pid, and host_local so operators can see why a worker was killed.
2026-05-22 05:22:09 +00:00 · 2026-05-10 09:52:29 +02:00 · 2026-05-10 09:52:29 +02:00 · 88588b6159
commit 88588b6159
parent 3974a137c6
2 changed files with 155 additions and 9 deletions
--- a/tests/hermes_cli/test_kanban_db.py
+++ b/tests/hermes_cli/test_kanban_db.py
@@ -177,12 +177,9 @@ def test_stale_claim_reclaimed(kanban_home, monkeypatch):
        host = _kb._claimer_id().split(":", 1)[0]
        kb.claim_task(conn, t, claimer=f"{host}:worker")
        killed: list[int] = []
-        state = {"alive": True}

-        def _signal(pid, sig):
+        def _signal(_pid, sig):
            killed.append(sig)
-            if sig == signal.SIGTERM:
-                state["alive"] = False

        kb._set_worker_pid(conn, t, 12345)
        # Rewind claim_expires so it looks stale.
@@ -190,13 +187,96 @@ def test_stale_claim_reclaimed(kanban_home, monkeypatch):
            "UPDATE tasks SET claim_expires = ? WHERE id = ?",
            (int(time.time()) - 3600, t),
        )
-        monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: state["alive"])
+        # Worker PID has died — exactly the case ``release_stale_claims``
+        # should still reclaim (post-#23025: live PIDs are now extended).
+        monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: False)
        reclaimed = kb.release_stale_claims(conn, signal_fn=_signal)
        assert reclaimed == 1
        assert kb.get_task(conn, t).status == "ready"
        assert killed == [signal.SIGTERM]


+def test_stale_claim_with_live_pid_extends_instead_of_reclaiming(
+    kanban_home, monkeypatch,
+):
+    """A stale-by-TTL claim whose worker PID is still alive should be
+    extended, not reclaimed (#23025). Slow models can spend longer than
+    ``DEFAULT_CLAIM_TTL_SECONDS`` inside a single tool-free LLM call;
+    killing those healthy workers produces a respawn loop with zero
+    progress."""
+    import hermes_cli.kanban_db as _kb
+
+    with kb.connect() as conn:
+        t = kb.create_task(conn, title="x", assignee="a")
+        host = _kb._claimer_id().split(":", 1)[0]
+        kb.claim_task(conn, t, claimer=f"{host}:worker")
+        kb._set_worker_pid(conn, t, 12345)
+
+        old_expires = int(time.time()) - 60
+        conn.execute(
+            "UPDATE tasks SET claim_expires = ? WHERE id = ?",
+            (old_expires, t),
+        )
+
+        monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: True)
+        killed: list[int] = []
+        reclaimed = kb.release_stale_claims(
+            conn, signal_fn=lambda _p, sig: killed.append(sig),
+        )
+        assert reclaimed == 0
+        task = kb.get_task(conn, t)
+        assert task.status == "running"
+        assert task.claim_expires is not None
+        assert task.claim_expires > old_expires
+        assert killed == []  # live worker not killed
+
+        kinds = [
+            r["kind"] for r in conn.execute(
+                "SELECT kind FROM task_events WHERE task_id = ?", (t,),
+            ).fetchall()
+        ]
+        assert "claim_extended" in kinds
+        assert "reclaimed" not in kinds
+
+
+def test_stale_claim_reclaim_event_records_diagnostic_payload(
+    kanban_home, monkeypatch,
+):
+    """``reclaimed`` events should carry claim_expires, last_heartbeat_at,
+    worker_pid, and host_local so operators can diagnose why a claim went
+    stale (#23025: the previous payload only had ``stale_lock``, which
+    gives no timing context)."""
+    import json
+    import hermes_cli.kanban_db as _kb
+
+    with kb.connect() as conn:
+        t = kb.create_task(conn, title="x", assignee="a")
+        host = _kb._claimer_id().split(":", 1)[0]
+        kb.claim_task(conn, t, claimer=f"{host}:worker")
+        kb._set_worker_pid(conn, t, 12345)
+        old_expires = int(time.time()) - 3600
+        hb_at = int(time.time()) - 1800
+        conn.execute(
+            "UPDATE tasks SET claim_expires = ?, last_heartbeat_at = ? "
+            "WHERE id = ?",
+            (old_expires, hb_at, t),
+        )
+
+        monkeypatch.setattr(_kb, "_pid_alive", lambda _pid: False)
+        kb.release_stale_claims(conn, signal_fn=lambda _p, _s: None)
+        row = conn.execute(
+            "SELECT payload FROM task_events "
+            "WHERE task_id = ? AND kind = 'reclaimed'",
+            (t,),
+        ).fetchone()
+        assert row is not None
+        payload = json.loads(row["payload"])
+        assert payload["claim_expires"] == old_expires
+        assert payload["last_heartbeat_at"] == hb_at
+        assert payload["worker_pid"] == 12345
+        assert payload["host_local"] is True
+
+
 def test_max_runtime_uses_current_run_start_after_retry(kanban_home):
    """A retry should get a fresh max-runtime window.