mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-30 11:52:04 +00:00
fix(gateway): make bg-process reset TTL configurable + surface session-scoped processes
Follow-up to the cherry-picked #29212 (#29177):
- Promote the 24h stale-process threshold to config.yaml (session_reset.bg_process_max_age_hours) instead of a hardcoded constant. 0 disables the cutoff (legacy: any live process blocks reset). Wired through GatewayConfig.default_reset_policy in gateway/run.py.
- Bug 2: process(action=list) now resolves the gateway session_key from the contextvar and surfaces session-scoped background processes (e.g. a forgotten preview server under a different task), flagged session_scoped, so the agent/user can discover and kill the blocker. Previously the task-scoped list returned [] and the blocker was invisible.
- Tests: config round-trip for the new field, cross-task list visibility.
- Docs: messaging session-reset section.
This commit is contained in:
parent
33d8b66d5b
commit
a1ac6baac4
6 changed files with 101 additions and 13 deletions
|
|
@ -288,7 +288,13 @@ class SessionResetPolicy:
|
|||
idle_minutes: int = 1440 # Minutes of inactivity before reset (24 hours)
|
||||
notify: bool = True # Send a notification to the user when auto-reset occurs
|
||||
notify_exclude_platforms: tuple = ("api_server", "webhook") # Platforms that don't get reset notifications
|
||||
|
||||
# A background process this many hours old (or older) no longer blocks
|
||||
# session idle/daily reset. A forgotten preview server should not keep a
|
||||
# session alive forever (#29177). The process is NOT killed — only ignored
|
||||
# by the reset guard. Raise this if you run legitimate multi-day jobs whose
|
||||
# liveness should pin the conversation open. Set to 0 to disable the cutoff.
|
||||
bg_process_max_age_hours: int = 24
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
return {
|
||||
"mode": self.mode,
|
||||
|
|
@ -296,6 +302,7 @@ class SessionResetPolicy:
|
|||
"idle_minutes": self.idle_minutes,
|
||||
"notify": self.notify,
|
||||
"notify_exclude_platforms": list(self.notify_exclude_platforms),
|
||||
"bg_process_max_age_hours": self.bg_process_max_age_hours,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
|
|
@ -306,12 +313,14 @@ class SessionResetPolicy:
|
|||
idle_minutes = data.get("idle_minutes")
|
||||
notify = data.get("notify")
|
||||
exclude = data.get("notify_exclude_platforms")
|
||||
bg_max_age = data.get("bg_process_max_age_hours")
|
||||
return cls(
|
||||
mode=mode if mode is not None else "both",
|
||||
at_hour=at_hour if at_hour is not None else 4,
|
||||
idle_minutes=idle_minutes if idle_minutes is not None else 1440,
|
||||
notify=_coerce_bool(notify, True),
|
||||
notify_exclude_platforms=tuple(exclude) if exclude is not None else ("api_server", "webhook"),
|
||||
bg_process_max_age_hours=bg_max_age if bg_max_age is not None else 24,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -2661,13 +2661,21 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
self._fallback_model = self._load_fallback_model()
|
||||
|
||||
# Wire process registry into session store for reset protection.
|
||||
# Processes older than MAX_ACTIVE_PROCESS_AGE (24h) are treated as
|
||||
# stale and no longer block session idle / daily reset — see #29177.
|
||||
from tools.process_registry import MAX_ACTIVE_PROCESS_AGE, process_registry
|
||||
# A background process older than the configured threshold (default 24h,
|
||||
# session_reset.bg_process_max_age_hours) is treated as stale and no
|
||||
# longer blocks session idle / daily reset — see #29177. The process is
|
||||
# NOT killed, only ignored by the reset guard.
|
||||
from tools.process_registry import process_registry
|
||||
_bg_max_age_hours = getattr(
|
||||
self.config.default_reset_policy, "bg_process_max_age_hours", 24
|
||||
)
|
||||
_bg_max_age_seconds = (
|
||||
_bg_max_age_hours * 3600 if _bg_max_age_hours and _bg_max_age_hours > 0 else None
|
||||
)
|
||||
self.session_store = SessionStore(
|
||||
self.config.sessions_dir, self.config,
|
||||
has_active_processes_fn=lambda key: process_registry.has_active_for_session(
|
||||
key, max_active_age=MAX_ACTIVE_PROCESS_AGE,
|
||||
key, max_active_age=_bg_max_age_seconds,
|
||||
),
|
||||
)
|
||||
self.delivery_router = DeliveryRouter(self.config)
|
||||
|
|
|
|||
|
|
@ -138,26 +138,31 @@ class TestGetConnectedPlatforms:
|
|||
|
||||
class TestSessionResetPolicy:
|
||||
def test_roundtrip(self):
|
||||
policy = SessionResetPolicy(mode="idle", at_hour=6, idle_minutes=120)
|
||||
policy = SessionResetPolicy(mode="idle", at_hour=6, idle_minutes=120,
|
||||
bg_process_max_age_hours=48)
|
||||
d = policy.to_dict()
|
||||
restored = SessionResetPolicy.from_dict(d)
|
||||
assert restored.mode == "idle"
|
||||
assert restored.at_hour == 6
|
||||
assert restored.idle_minutes == 120
|
||||
assert restored.bg_process_max_age_hours == 48
|
||||
|
||||
def test_defaults(self):
|
||||
policy = SessionResetPolicy()
|
||||
assert policy.mode == "both"
|
||||
assert policy.at_hour == 4
|
||||
assert policy.idle_minutes == 1440
|
||||
assert policy.bg_process_max_age_hours == 24
|
||||
|
||||
def test_from_dict_treats_null_values_as_defaults(self):
|
||||
restored = SessionResetPolicy.from_dict(
|
||||
{"mode": None, "at_hour": None, "idle_minutes": None}
|
||||
{"mode": None, "at_hour": None, "idle_minutes": None,
|
||||
"bg_process_max_age_hours": None}
|
||||
)
|
||||
assert restored.mode == "both"
|
||||
assert restored.at_hour == 4
|
||||
assert restored.idle_minutes == 1440
|
||||
assert restored.bg_process_max_age_hours == 24
|
||||
|
||||
def test_from_dict_coerces_quoted_false_notify(self):
|
||||
restored = SessionResetPolicy.from_dict({"notify": "false"})
|
||||
|
|
|
|||
|
|
@ -416,6 +416,34 @@ class TestListSessions:
|
|||
assert len(result) == 1
|
||||
assert result[0]["session_id"] == "proc_1"
|
||||
|
||||
def test_session_key_surfaces_cross_task_processes(self, registry):
|
||||
"""A bg process under the same gateway session but a DIFFERENT task is
|
||||
surfaced when session_key is passed, and flagged session_scoped (#29177).
|
||||
"""
|
||||
# Current turn's task = "t_now"; forgotten preview server = "t_old"
|
||||
# but both share gateway session_key "gw1".
|
||||
own = _make_session(sid="proc_own", task_id="t_now")
|
||||
own.session_key = "gw1"
|
||||
forgotten = _make_session(sid="proc_forgotten", task_id="t_old")
|
||||
forgotten.session_key = "gw1"
|
||||
other = _make_session(sid="proc_other", task_id="t_x")
|
||||
other.session_key = "gw_other"
|
||||
registry._running[own.id] = own
|
||||
registry._running[forgotten.id] = forgotten
|
||||
registry._running[other.id] = other
|
||||
|
||||
# Task-only (legacy) view sees just the current task's process.
|
||||
legacy = registry.list_sessions(task_id="t_now")
|
||||
assert {r["session_id"] for r in legacy} == {"proc_own"}
|
||||
|
||||
# With session_key, the forgotten process under the same gateway
|
||||
# session is surfaced and flagged; the unrelated session is not.
|
||||
result = registry.list_sessions(task_id="t_now", session_key="gw1")
|
||||
by_id = {r["session_id"]: r for r in result}
|
||||
assert set(by_id) == {"proc_own", "proc_forgotten"}
|
||||
assert by_id["proc_forgotten"].get("session_scoped") is True
|
||||
assert "session_scoped" not in by_id["proc_own"]
|
||||
|
||||
def test_list_entry_fields(self, registry):
|
||||
s = _make_session(output="preview text")
|
||||
registry._running[s.id] = s
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ CHECKPOINT_PATH = get_hermes_home() / "processes.json"
|
|||
MAX_OUTPUT_CHARS = 200_000 # 200KB rolling output buffer
|
||||
FINISHED_TTL_SECONDS = 1800 # Keep finished processes for 30 minutes
|
||||
MAX_PROCESSES = 64 # Max concurrent tracked processes (LRU pruning)
|
||||
MAX_ACTIVE_PROCESS_AGE = 86400 # 24h — stale processes no longer block session reset
|
||||
MAX_ACTIVE_PROCESS_AGE = 86400 # 24h default — see session_reset.bg_process_max_age_hours (#29177)
|
||||
|
||||
# Watch pattern rate limiting — PER SESSION.
|
||||
# Hard rule: at most ONE watch-match notification every WATCH_MIN_INTERVAL_SECONDS.
|
||||
|
|
@ -1515,15 +1515,28 @@ class ProcessRegistry:
|
|||
except Exception:
|
||||
return 0
|
||||
|
||||
def list_sessions(self, task_id: str = None) -> list:
|
||||
"""List all running and recently-finished processes."""
|
||||
def list_sessions(self, task_id: str = None, session_key: str = None) -> list:
|
||||
"""List all running and recently-finished processes.
|
||||
|
||||
When ``task_id`` is given, processes for that task are included. When
|
||||
``session_key`` is also given, session-scoped background processes
|
||||
(``background: true``) registered under that gateway session are
|
||||
surfaced too, even if they belong to a different task — so the agent
|
||||
can discover a forgotten preview server that is blocking session
|
||||
reset (#29177). Such cross-task entries are flagged with
|
||||
``"session_scoped": true``.
|
||||
"""
|
||||
with self._lock:
|
||||
all_sessions = list(self._running.values()) + list(self._finished.values())
|
||||
|
||||
all_sessions = [self._refresh_detached_session(s) for s in all_sessions]
|
||||
|
||||
if task_id:
|
||||
all_sessions = [s for s in all_sessions if s.task_id == task_id]
|
||||
if task_id or session_key:
|
||||
all_sessions = [
|
||||
s for s in all_sessions
|
||||
if (task_id and s.task_id == task_id)
|
||||
or (session_key and s.session_key == session_key)
|
||||
]
|
||||
|
||||
result = []
|
||||
for s in all_sessions:
|
||||
|
|
@ -1537,6 +1550,11 @@ class ProcessRegistry:
|
|||
"status": "exited" if s.exited else "running",
|
||||
"output_preview": s.output_buffer[-200:] if s.output_buffer else "",
|
||||
}
|
||||
# Flag processes surfaced only because they share the gateway
|
||||
# session (not the current task) — these are the long-lived
|
||||
# background processes a user may have forgotten about (#29177).
|
||||
if task_id and session_key and s.task_id != task_id and s.session_key == session_key:
|
||||
entry["session_scoped"] = True
|
||||
# Trigger metadata so a goal-loop judge can decide to wait on this
|
||||
# process's OWN signal (a watch-pattern match or completion), not
|
||||
# just its exit. A watcher with watch_patterns may never exit.
|
||||
|
|
@ -2070,7 +2088,18 @@ def _handle_process(args, **kw):
|
|||
session_id = str(args.get("session_id", "")) if args.get("session_id") is not None else ""
|
||||
|
||||
if action == "list":
|
||||
return json.dumps({"processes": process_registry.list_sessions(task_id=task_id)}, ensure_ascii=False)
|
||||
# Surface session-scoped background processes (e.g. a forgotten
|
||||
# preview server) in addition to this task's own — they share the
|
||||
# gateway session_key and can block session reset (#29177).
|
||||
try:
|
||||
from tools.approval import get_current_session_key
|
||||
session_key = get_current_session_key(default="") or ""
|
||||
except Exception:
|
||||
session_key = ""
|
||||
return json.dumps(
|
||||
{"processes": process_registry.list_sessions(task_id=task_id, session_key=session_key or None)},
|
||||
ensure_ascii=False,
|
||||
)
|
||||
elif action in {"poll", "log", "wait", "kill", "write", "submit", "close"}:
|
||||
if not session_id:
|
||||
return tool_error(f"session_id is required for {action}")
|
||||
|
|
|
|||
|
|
@ -198,6 +198,15 @@ Sessions reset based on configurable policies:
|
|||
| Idle | 1440 min | Reset after N minutes of inactivity |
|
||||
| Both | (combined) | Whichever triggers first |
|
||||
|
||||
A live background process (started with `terminal(background=true)`) normally
|
||||
protects its session from resetting so output isn't lost. To stop a forgotten
|
||||
process — say a preview server — from pinning a session open forever, a
|
||||
background process older than `bg_process_max_age_hours` (default **24**) no
|
||||
longer blocks reset. The process is **not** killed, only ignored by the reset
|
||||
guard. Set it to `0` to disable the cutoff (any live process blocks reset, the
|
||||
old behavior), or raise it if you run legitimate multi-day jobs whose liveness
|
||||
should keep the conversation open.
|
||||
|
||||
Configure per-platform overrides in `~/.hermes/gateway.json`:
|
||||
|
||||
```json
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue