From 33d8b66d5bf591db618a24b80c83f3f2cac39200 Mon Sep 17 00:00:00 2001 From: annguyenNous Date: Wed, 20 May 2026 16:09:45 +0700 Subject: [PATCH] fix: stale background processes no longer permanently block session reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Background processes (e.g. http.server preview) that Hermes starts and forgets about previously blocked session idle/daily reset indefinitely. The reset guard in session.py checked has_active_for_session() with no max age — a 3-day-old preview server blocked reset the same as a task started 30 seconds ago. Changes: - Add max_active_age parameter to has_active_for_session() in process_registry.py. Processes older than this threshold are ignored. - Add MAX_ACTIVE_PROCESS_AGE constant (24h / 86400s). - Wire max_active_age into the gateway's session store callback in run.py so stale processes no longer block session lifecycle. - Add debug logging when reset is skipped due to active processes. - Add 3 tests covering recent, stale, and legacy (None) max age. Fixes #29177 --- gateway/run.py | 10 +++++++--- gateway/session.py | 8 ++++++++ tests/tools/test_process_registry.py | 22 ++++++++++++++++++++++ tools/process_registry.py | 25 ++++++++++++++++++++++--- 4 files changed, 59 insertions(+), 6 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index a60de0daa35..acffb72edee 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2660,11 +2660,15 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew self._provider_routing = self._load_provider_routing() self._fallback_model = self._load_fallback_model() - # Wire process registry into session store for reset protection - from tools.process_registry import process_registry + # Wire process registry into session store for reset protection. + # Processes older than MAX_ACTIVE_PROCESS_AGE (24h) are treated as + # stale and no longer block session idle / daily reset — see #29177. + from tools.process_registry import MAX_ACTIVE_PROCESS_AGE, process_registry self.session_store = SessionStore( self.config.sessions_dir, self.config, - has_active_processes_fn=lambda key: process_registry.has_active_for_session(key), + has_active_processes_fn=lambda key: process_registry.has_active_for_session( + key, max_active_age=MAX_ACTIVE_PROCESS_AGE, + ), ) self.delivery_router = DeliveryRouter(self.config) self._running = False diff --git a/gateway/session.py b/gateway/session.py index f79e371d804..1c6a23a4c3b 100644 --- a/gateway/session.py +++ b/gateway/session.py @@ -924,6 +924,10 @@ class SessionStore: """ if self._has_active_processes_fn: if self._has_active_processes_fn(entry.session_key): + logger.debug( + "Session %s not expired — active background processes", + entry.session_key, + ) return False policy = self.config.get_reset_policy( @@ -965,6 +969,10 @@ class SessionStore: if self._has_active_processes_fn: session_key = self._generate_session_key(source) if self._has_active_processes_fn(session_key): + logger.debug( + "Session reset skipped for %s — active background processes", + session_key, + ) return None policy = self.config.get_reset_policy( diff --git a/tests/tools/test_process_registry.py b/tests/tools/test_process_registry.py index 659ef2e21e8..b1f22cb0b3e 100644 --- a/tests/tools/test_process_registry.py +++ b/tests/tools/test_process_registry.py @@ -16,6 +16,7 @@ from tools.process_registry import ( ProcessSession, FINISHED_TTL_SECONDS, MAX_PROCESSES, + MAX_ACTIVE_PROCESS_AGE, ) @@ -444,6 +445,27 @@ class TestActiveQueries: assert registry.has_active_for_session("gw_session_1") is True assert registry.has_active_for_session("other") is False + def test_has_active_for_session_with_max_age_recent(self, registry): + """Recent process is considered active when max_active_age is set.""" + s = _make_session(started_at=time.time() - 100) + s.session_key = "gw_session_1" + registry._running[s.id] = s + assert registry.has_active_for_session("gw_session_1", max_active_age=3600) is True + + def test_has_active_for_session_with_max_age_stale(self, registry): + """Stale process (older than max_active_age) is ignored.""" + s = _make_session(started_at=time.time() - 90000) # 25 hours ago + s.session_key = "gw_session_1" + registry._running[s.id] = s + assert registry.has_active_for_session("gw_session_1", max_active_age=86400) is False + + def test_has_active_for_session_max_age_none_preserves_legacy(self, registry): + """Without max_active_age, any running process blocks (legacy behaviour).""" + s = _make_session(started_at=time.time() - 90000) # 25 hours ago + s.session_key = "gw_session_1" + registry._running[s.id] = s + assert registry.has_active_for_session("gw_session_1") is True + def test_exited_not_active(self, registry): s = _make_session(task_id="t1", exited=True, exit_code=0) registry._finished[s.id] = s diff --git a/tools/process_registry.py b/tools/process_registry.py index e21c68af993..5f74add8120 100644 --- a/tools/process_registry.py +++ b/tools/process_registry.py @@ -58,6 +58,7 @@ CHECKPOINT_PATH = get_hermes_home() / "processes.json" MAX_OUTPUT_CHARS = 200_000 # 200KB rolling output buffer FINISHED_TTL_SECONDS = 1800 # Keep finished processes for 30 minutes MAX_PROCESSES = 64 # Max concurrent tracked processes (LRU pruning) +MAX_ACTIVE_PROCESS_AGE = 86400 # 24h — stale processes no longer block session reset # Watch pattern rate limiting — PER SESSION. # Hard rule: at most ONE watch-match notification every WATCH_MIN_INTERVAL_SECONDS. @@ -1567,17 +1568,35 @@ class ProcessRegistry: for s in self._running.values() ) - def has_active_for_session(self, session_key: str) -> bool: - """Check if there are active processes for a gateway session key.""" + def has_active_for_session( + self, session_key: str, max_active_age: Optional[float] = None, + ) -> bool: + """Check if there are active processes for a gateway session key. + + When *max_active_age* is set (seconds), processes that started more + than that many seconds ago are **ignored** — they are still running + but are considered stale and must not block session idle / daily + reset. This prevents a forgotten ``http.server`` (or any long-lived + preview process) from permanently freezing the session lifecycle. + + Args: + session_key: Gateway session key to check. + max_active_age: If set, ignore processes older than this many + seconds. ``None`` retains the legacy behaviour (any running + process blocks). + """ with self._lock: sessions = list(self._running.values()) for session in sessions: self._refresh_detached_session(session) + now = time.time() with self._lock: return any( - s.session_key == session_key and not s.exited + s.session_key == session_key + and not s.exited + and (max_active_age is None or (now - s.started_at) < max_active_age) for s in self._running.values() )