From a1ac6baac45dbe2a23c6ffeb87d43ceda933c4fc Mon Sep 17 00:00:00 2001 From: teknium1 <127238744+teknium1@users.noreply.github.com> Date: Sat, 27 Jun 2026 19:53:29 -0700 Subject: [PATCH] fix(gateway): make bg-process reset TTL configurable + surface session-scoped processes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the cherry-picked #29212 (#29177): - Promote the 24h stale-process threshold to config.yaml (session_reset.bg_process_max_age_hours) instead of a hardcoded constant. 0 disables the cutoff (legacy: any live process blocks reset). Wired through GatewayConfig.default_reset_policy in gateway/run.py. - Bug 2: process(action=list) now resolves the gateway session_key from the contextvar and surfaces session-scoped background processes (a forgotten preview server under a different task), flagged session_scoped — so the agent/user can discover and kill the blocker. Previously the task-scoped list returned [] and the blocker was invisible. - Tests: config round-trip for the new field, cross-task list visibility. - Docs: messaging session-reset section. --- gateway/config.py | 11 +++++- gateway/run.py | 16 ++++++--- tests/gateway/test_config.py | 9 +++-- tests/tools/test_process_registry.py | 28 +++++++++++++++ tools/process_registry.py | 41 ++++++++++++++++++---- website/docs/user-guide/messaging/index.md | 9 +++++ 6 files changed, 101 insertions(+), 13 deletions(-) diff --git a/gateway/config.py b/gateway/config.py index e1556b37d52..cdf895c4ee2 100644 --- a/gateway/config.py +++ b/gateway/config.py @@ -288,7 +288,13 @@ class SessionResetPolicy: idle_minutes: int = 1440 # Minutes of inactivity before reset (24 hours) notify: bool = True # Send a notification to the user when auto-reset occurs notify_exclude_platforms: tuple = ("api_server", "webhook") # Platforms that don't get reset notifications - + # A background process this many hours old (or older) no longer blocks + # session idle/daily reset. 
A forgotten preview server should not keep a + # session alive forever (#29177). The process is NOT killed — only ignored + # by the reset guard. Raise this if you run legitimate multi-day jobs whose + # liveness should pin the conversation open. + bg_process_max_age_hours: int = 24 + def to_dict(self) -> Dict[str, Any]: return { "mode": self.mode, @@ -296,6 +302,7 @@ class SessionResetPolicy: "idle_minutes": self.idle_minutes, "notify": self.notify, "notify_exclude_platforms": list(self.notify_exclude_platforms), + "bg_process_max_age_hours": self.bg_process_max_age_hours, } @classmethod @@ -306,12 +313,14 @@ class SessionResetPolicy: idle_minutes = data.get("idle_minutes") notify = data.get("notify") exclude = data.get("notify_exclude_platforms") + bg_max_age = data.get("bg_process_max_age_hours") return cls( mode=mode if mode is not None else "both", at_hour=at_hour if at_hour is not None else 4, idle_minutes=idle_minutes if idle_minutes is not None else 1440, notify=_coerce_bool(notify, True), notify_exclude_platforms=tuple(exclude) if exclude is not None else ("api_server", "webhook"), + bg_process_max_age_hours=bg_max_age if bg_max_age is not None else 24, ) diff --git a/gateway/run.py b/gateway/run.py index acffb72edee..72ba4e72341 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2661,13 +2661,21 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew self._fallback_model = self._load_fallback_model() # Wire process registry into session store for reset protection. - # Processes older than MAX_ACTIVE_PROCESS_AGE (24h) are treated as - # stale and no longer block session idle / daily reset — see #29177. - from tools.process_registry import MAX_ACTIVE_PROCESS_AGE, process_registry + # A background process older than the configured threshold (default 24h, + # session_reset.bg_process_max_age_hours) is treated as stale and no + # longer blocks session idle / daily reset — see #29177. 
The process is + # NOT killed, only ignored by the reset guard. + from tools.process_registry import process_registry + _bg_max_age_hours = getattr( + self.config.default_reset_policy, "bg_process_max_age_hours", 24 + ) + _bg_max_age_seconds = ( + _bg_max_age_hours * 3600 if _bg_max_age_hours and _bg_max_age_hours > 0 else None + ) self.session_store = SessionStore( self.config.sessions_dir, self.config, has_active_processes_fn=lambda key: process_registry.has_active_for_session( - key, max_active_age=MAX_ACTIVE_PROCESS_AGE, + key, max_active_age=_bg_max_age_seconds, ), ) self.delivery_router = DeliveryRouter(self.config) diff --git a/tests/gateway/test_config.py b/tests/gateway/test_config.py index 79bccc100ca..a4a4ffade06 100644 --- a/tests/gateway/test_config.py +++ b/tests/gateway/test_config.py @@ -138,26 +138,31 @@ class TestGetConnectedPlatforms: class TestSessionResetPolicy: def test_roundtrip(self): - policy = SessionResetPolicy(mode="idle", at_hour=6, idle_minutes=120) + policy = SessionResetPolicy(mode="idle", at_hour=6, idle_minutes=120, + bg_process_max_age_hours=48) d = policy.to_dict() restored = SessionResetPolicy.from_dict(d) assert restored.mode == "idle" assert restored.at_hour == 6 assert restored.idle_minutes == 120 + assert restored.bg_process_max_age_hours == 48 def test_defaults(self): policy = SessionResetPolicy() assert policy.mode == "both" assert policy.at_hour == 4 assert policy.idle_minutes == 1440 + assert policy.bg_process_max_age_hours == 24 def test_from_dict_treats_null_values_as_defaults(self): restored = SessionResetPolicy.from_dict( - {"mode": None, "at_hour": None, "idle_minutes": None} + {"mode": None, "at_hour": None, "idle_minutes": None, + "bg_process_max_age_hours": None} ) assert restored.mode == "both" assert restored.at_hour == 4 assert restored.idle_minutes == 1440 + assert restored.bg_process_max_age_hours == 24 def test_from_dict_coerces_quoted_false_notify(self): restored = SessionResetPolicy.from_dict({"notify": 
"false"}) diff --git a/tests/tools/test_process_registry.py b/tests/tools/test_process_registry.py index b1f22cb0b3e..3dd40aec421 100644 --- a/tests/tools/test_process_registry.py +++ b/tests/tools/test_process_registry.py @@ -416,6 +416,34 @@ class TestListSessions: assert len(result) == 1 assert result[0]["session_id"] == "proc_1" + def test_session_key_surfaces_cross_task_processes(self, registry): + """A bg process under the same gateway session but a DIFFERENT task is + surfaced when session_key is passed, and flagged session_scoped (#29177). + """ + # Current turn's task = "t_now"; forgotten preview server = "t_old" + # but both share gateway session_key "gw1". + own = _make_session(sid="proc_own", task_id="t_now") + own.session_key = "gw1" + forgotten = _make_session(sid="proc_forgotten", task_id="t_old") + forgotten.session_key = "gw1" + other = _make_session(sid="proc_other", task_id="t_x") + other.session_key = "gw_other" + registry._running[own.id] = own + registry._running[forgotten.id] = forgotten + registry._running[other.id] = other + + # Task-only (legacy) view sees just the current task's process. + legacy = registry.list_sessions(task_id="t_now") + assert {r["session_id"] for r in legacy} == {"proc_own"} + + # With session_key, the forgotten process under the same gateway + # session is surfaced and flagged; the unrelated session is not. 
+ result = registry.list_sessions(task_id="t_now", session_key="gw1") + by_id = {r["session_id"]: r for r in result} + assert set(by_id) == {"proc_own", "proc_forgotten"} + assert by_id["proc_forgotten"].get("session_scoped") is True + assert "session_scoped" not in by_id["proc_own"] + def test_list_entry_fields(self, registry): s = _make_session(output="preview text") registry._running[s.id] = s diff --git a/tools/process_registry.py b/tools/process_registry.py index 5f74add8120..6d966c14e34 100644 --- a/tools/process_registry.py +++ b/tools/process_registry.py @@ -58,7 +58,7 @@ CHECKPOINT_PATH = get_hermes_home() / "processes.json" MAX_OUTPUT_CHARS = 200_000 # 200KB rolling output buffer FINISHED_TTL_SECONDS = 1800 # Keep finished processes for 30 minutes MAX_PROCESSES = 64 # Max concurrent tracked processes (LRU pruning) -MAX_ACTIVE_PROCESS_AGE = 86400 # 24h — stale processes no longer block session reset +MAX_ACTIVE_PROCESS_AGE = 86400 # 24h default — see session_reset.bg_process_max_age_hours (#29177) # Watch pattern rate limiting — PER SESSION. # Hard rule: at most ONE watch-match notification every WATCH_MIN_INTERVAL_SECONDS. @@ -1515,15 +1515,28 @@ class ProcessRegistry: except Exception: return 0 - def list_sessions(self, task_id: str = None) -> list: - """List all running and recently-finished processes.""" + def list_sessions(self, task_id: str = None, session_key: str = None) -> list: + """List all running and recently-finished processes. + + When ``task_id`` is given, processes for that task are included. When + ``session_key`` is also given, session-scoped background processes + (``background: true``) registered under that gateway session are + surfaced too, even if they belong to a different task — so the agent + can discover a forgotten preview server that is blocking session + reset (#29177). Such cross-task entries are flagged with + ``"session_scoped": true``. 
+ """ with self._lock: all_sessions = list(self._running.values()) + list(self._finished.values()) all_sessions = [self._refresh_detached_session(s) for s in all_sessions] - if task_id: - all_sessions = [s for s in all_sessions if s.task_id == task_id] + if task_id or session_key: + all_sessions = [ + s for s in all_sessions + if (task_id and s.task_id == task_id) + or (session_key and s.session_key == session_key) + ] result = [] for s in all_sessions: @@ -1537,6 +1550,11 @@ class ProcessRegistry: "status": "exited" if s.exited else "running", "output_preview": s.output_buffer[-200:] if s.output_buffer else "", } + # Flag processes surfaced only because they share the gateway + # session (not the current task) — these are the long-lived + # background processes a user may have forgotten about (#29177). + if task_id and session_key and s.task_id != task_id and s.session_key == session_key: + entry["session_scoped"] = True # Trigger metadata so a goal-loop judge can decide to wait on this # process's OWN signal (a watch-pattern match or completion), not # just its exit. A watcher with watch_patterns may never exit. @@ -2070,7 +2088,18 @@ def _handle_process(args, **kw): session_id = str(args.get("session_id", "")) if args.get("session_id") is not None else "" if action == "list": - return json.dumps({"processes": process_registry.list_sessions(task_id=task_id)}, ensure_ascii=False) + # Surface session-scoped background processes (e.g. a forgotten + # preview server) in addition to this task's own — they share the + # gateway session_key and can block session reset (#29177). 
+ try: + from tools.approval import get_current_session_key + session_key = get_current_session_key(default="") or "" + except Exception: + session_key = "" + return json.dumps( + {"processes": process_registry.list_sessions(task_id=task_id, session_key=session_key or None)}, + ensure_ascii=False, + ) elif action in {"poll", "log", "wait", "kill", "write", "submit", "close"}: if not session_id: return tool_error(f"session_id is required for {action}") diff --git a/website/docs/user-guide/messaging/index.md b/website/docs/user-guide/messaging/index.md index 0e91be6e100..17f9331c65e 100644 --- a/website/docs/user-guide/messaging/index.md +++ b/website/docs/user-guide/messaging/index.md @@ -198,6 +198,15 @@ Sessions reset based on configurable policies: | Idle | 1440 min | Reset after N minutes of inactivity | | Both | (combined) | Whichever triggers first | +A live background process (started with `terminal(background=true)`) normally +protects its session from resetting so output isn't lost. To stop a forgotten +process — say a preview server — from pinning a session open forever, a +background process older than `session_reset.bg_process_max_age_hours` hours (set in `config.yaml`, default **24**) no +longer blocks reset. The process is **not** killed, only ignored by the reset +guard. Set it to `0` to disable the cutoff (any live process blocks reset, the +old behavior), or raise it if you run legitimate multi-day jobs whose liveness +should keep the conversation open. + Configure per-platform overrides in `~/.hermes/gateway.json`: ```json