fix: stale background processes no longer permanently block session reset

Background processes (e.g. an http.server preview) that Hermes starts and
then forgets about used to block session idle/daily reset indefinitely.
The reset guard in session.py checked has_active_for_session() with no
max age — a 3-day-old preview server blocked reset the same as a task
started 30 seconds ago.

Changes:
- Add max_active_age parameter to has_active_for_session() in
  process_registry.py. Processes older than this threshold are ignored.
- Add MAX_ACTIVE_PROCESS_AGE constant (24h / 86400s).
- Wire max_active_age into the gateway's session store callback in
  run.py so stale processes no longer block session lifecycle.
- Add debug logging when reset is skipped due to active processes.
- Add 3 tests covering recent, stale, and legacy (None) max age.

Fixes #29177
This commit is contained in:
annguyenNous 2026-05-20 16:09:45 +07:00 committed by Teknium
parent 8c8967a50b
commit 33d8b66d5b
4 changed files with 59 additions and 6 deletions

View file

@ -2660,11 +2660,15 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
self._provider_routing = self._load_provider_routing()
self._fallback_model = self._load_fallback_model()
# Wire process registry into session store for reset protection
from tools.process_registry import process_registry
# Wire process registry into session store for reset protection.
# Processes older than MAX_ACTIVE_PROCESS_AGE (24h) are treated as
# stale and no longer block session idle / daily reset — see #29177.
from tools.process_registry import MAX_ACTIVE_PROCESS_AGE, process_registry
self.session_store = SessionStore(
self.config.sessions_dir, self.config,
has_active_processes_fn=lambda key: process_registry.has_active_for_session(key),
has_active_processes_fn=lambda key: process_registry.has_active_for_session(
key, max_active_age=MAX_ACTIVE_PROCESS_AGE,
),
)
self.delivery_router = DeliveryRouter(self.config)
self._running = False

View file

@ -924,6 +924,10 @@ class SessionStore:
"""
if self._has_active_processes_fn:
if self._has_active_processes_fn(entry.session_key):
logger.debug(
"Session %s not expired — active background processes",
entry.session_key,
)
return False
policy = self.config.get_reset_policy(
@ -965,6 +969,10 @@ class SessionStore:
if self._has_active_processes_fn:
session_key = self._generate_session_key(source)
if self._has_active_processes_fn(session_key):
logger.debug(
"Session reset skipped for %s — active background processes",
session_key,
)
return None
policy = self.config.get_reset_policy(

View file

@ -16,6 +16,7 @@ from tools.process_registry import (
ProcessSession,
FINISHED_TTL_SECONDS,
MAX_PROCESSES,
MAX_ACTIVE_PROCESS_AGE,
)
@ -444,6 +445,27 @@ class TestActiveQueries:
assert registry.has_active_for_session("gw_session_1") is True
assert registry.has_active_for_session("other") is False
def test_has_active_for_session_with_max_age_recent(self, registry):
    """A process started 100 s ago still counts as active under a 1 h cap."""
    session_key = "gw_session_1"
    proc = _make_session(started_at=time.time() - 100)
    proc.session_key = session_key
    registry._running[proc.id] = proc

    is_active = registry.has_active_for_session(session_key, max_active_age=3600)
    assert is_active is True
def test_has_active_for_session_with_max_age_stale(self, registry):
    """A process older than max_active_age must not be reported as active."""
    session_key = "gw_session_1"
    # 90000 s is 25 hours, one hour past the 86400 s (24 h) cutoff used below.
    proc = _make_session(started_at=time.time() - 90000)
    proc.session_key = session_key
    registry._running[proc.id] = proc

    is_active = registry.has_active_for_session(session_key, max_active_age=86400)
    assert is_active is False
def test_has_active_for_session_max_age_none_preserves_legacy(self, registry):
    """With no max_active_age, even a 25-hour-old running process blocks."""
    session_key = "gw_session_1"
    proc = _make_session(started_at=time.time() - 90000)
    proc.session_key = session_key
    registry._running[proc.id] = proc

    # No max_active_age argument: the age filter is disabled entirely.
    is_active = registry.has_active_for_session(session_key)
    assert is_active is True
def test_exited_not_active(self, registry):
s = _make_session(task_id="t1", exited=True, exit_code=0)
registry._finished[s.id] = s

View file

@ -58,6 +58,7 @@ CHECKPOINT_PATH = get_hermes_home() / "processes.json"
MAX_OUTPUT_CHARS = 200_000 # 200KB rolling output buffer
FINISHED_TTL_SECONDS = 1800 # Keep finished processes for 30 minutes
MAX_PROCESSES = 64 # Max concurrent tracked processes (LRU pruning)
MAX_ACTIVE_PROCESS_AGE = 86400 # 24h in seconds — age cutoff passed as max_active_age so older running processes no longer block session reset
# Watch pattern rate limiting — PER SESSION.
# Hard rule: at most ONE watch-match notification every WATCH_MIN_INTERVAL_SECONDS.
@ -1567,17 +1568,35 @@ class ProcessRegistry:
for s in self._running.values()
)
def has_active_for_session(self, session_key: str) -> bool:
"""Check if there are active processes for a gateway session key."""
def has_active_for_session(
self, session_key: str, max_active_age: Optional[float] = None,
) -> bool:
"""Check if there are active processes for a gateway session key.

When *max_active_age* is set (seconds), processes that started more
than that many seconds ago are **ignored** — they are still running
but are considered stale and must not block session idle / daily
reset. This prevents a forgotten ``http.server`` (or any long-lived
preview process) from permanently freezing the session lifecycle.

Args:
session_key: Gateway session key to check.
max_active_age: If set, ignore processes older than this many
seconds. ``None`` retains the legacy behaviour (any running
process blocks).

Returns:
``True`` if at least one entry in ``self._running`` has this
``session_key``, has not exited, and (when *max_active_age* is
given) started strictly less than *max_active_age* seconds ago;
``False`` otherwise.
"""
# Take a snapshot of the running sessions while holding the lock.
with self._lock:
sessions = list(self._running.values())
# NOTE(review): presumably this updates ``exited`` for detached
# processes before the check below — confirm against the helper.
for session in sessions:
self._refresh_detached_session(session)
# One timestamp, so every process is aged against the same instant.
now = time.time()
with self._lock:
return any(
s.session_key == session_key and not s.exited
s.session_key == session_key
and not s.exited
and (max_active_age is None or (now - s.started_at) < max_active_age)
for s in self._running.values()
)