mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-26 01:01:40 +00:00
fix(process): correct detached crash recovery state
Previously, crash recovery recreated detached sessions as if they were fully managed, so polls and kills could lie about liveness, and the checkpoint could forget recovered jobs after the next restart. This commit refreshes recovered host-backed sessions from real PID state, keeps checkpoint data durable, and preserves notify watcher metadata, while treating sandbox-only PIDs as non-recoverable.

- Persist `pid_scope` in `tools/process_registry.py` and skip recovering sandbox-backed entries that have no host-visible PID handle
- Refresh detached sessions on access so that `get`/`poll`/`wait` and active-session queries observe exited processes instead of hanging forever
- Allow recovered host PIDs to be terminated honestly, and requeue `notify_on_complete` watchers during checkpoint recovery
- Add regression tests for durable checkpoints, detached exit/kill behavior, sandbox skip logic, and recovered notify watchers
This commit is contained in:
parent
383db35925
commit
19b0ddce40
3 changed files with 241 additions and 13 deletions
|
|
@ -197,6 +197,26 @@ class TestCheckpointNotify:
|
|||
s = registry.get("proc_live")
|
||||
assert s.notify_on_complete is True
|
||||
|
||||
def test_recover_requeues_notify_watchers(self, registry, tmp_path):
|
||||
checkpoint = tmp_path / "procs.json"
|
||||
checkpoint.write_text(json.dumps([{
|
||||
"session_id": "proc_live",
|
||||
"command": "sleep 999",
|
||||
"pid": os.getpid(),
|
||||
"task_id": "t1",
|
||||
"session_key": "sk1",
|
||||
"watcher_platform": "telegram",
|
||||
"watcher_chat_id": "123",
|
||||
"watcher_thread_id": "42",
|
||||
"watcher_interval": 5,
|
||||
"notify_on_complete": True,
|
||||
}]))
|
||||
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
||||
recovered = registry.recover_from_checkpoint()
|
||||
assert recovered == 1
|
||||
assert len(registry.pending_watchers) == 1
|
||||
assert registry.pending_watchers[0]["notify_on_complete"] is True
|
||||
|
||||
def test_recover_defaults_false(self, registry, tmp_path):
|
||||
"""Old checkpoint entries without the field default to False."""
|
||||
checkpoint = tmp_path / "procs.json"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue