fix(gateway): make bg-process reset TTL configurable + surface session-scoped processes

Follow-up to the cherry-picked #29212 (#29177):

- Promote the 24h stale-process threshold to config.yaml
  (session_reset.bg_process_max_age_hours) instead of a hardcoded
  constant. 0 disables the cutoff (legacy: any live process blocks reset).
  Wired through GatewayConfig.default_reset_policy in gateway/run.py.
- Bug 2: process(action=list) now resolves the gateway session_key from
  the contextvar and surfaces session-scoped background processes (a
  forgotten preview server under a different task), flagged
  session_scoped — so the agent/user can discover and kill the blocker.
  Previously the task-scoped list returned [] and the blocker was invisible.
- Tests: config round-trip for the new field, cross-task list visibility.
- Docs: messaging session-reset section.
This commit is contained in:
teknium1 2026-06-27 19:53:29 -07:00 committed by Teknium
parent 33d8b66d5b
commit a1ac6baac4
6 changed files with 101 additions and 13 deletions

View file

@ -288,7 +288,13 @@ class SessionResetPolicy:
idle_minutes: int = 1440 # Minutes of inactivity before reset (24 hours)
notify: bool = True # Send a notification to the user when auto-reset occurs
notify_exclude_platforms: tuple = ("api_server", "webhook") # Platforms that don't get reset notifications
# A background process this many hours old (or older) no longer blocks
# session idle/daily reset. A forgotten preview server should not keep a
# session alive forever (#29177). The process is NOT killed — only ignored
# by the reset guard. Raise this if you run legitimate multi-day jobs whose
# liveness should pin the conversation open. Set to 0 to disable the cutoff
# entirely (legacy behavior: any live process blocks reset).
bg_process_max_age_hours: int = 24
def to_dict(self) -> Dict[str, Any]:
return {
"mode": self.mode,
@ -296,6 +302,7 @@ class SessionResetPolicy:
"idle_minutes": self.idle_minutes,
"notify": self.notify,
"notify_exclude_platforms": list(self.notify_exclude_platforms),
"bg_process_max_age_hours": self.bg_process_max_age_hours,
}
@classmethod
@ -306,12 +313,14 @@ class SessionResetPolicy:
idle_minutes = data.get("idle_minutes")
notify = data.get("notify")
exclude = data.get("notify_exclude_platforms")
bg_max_age = data.get("bg_process_max_age_hours")
return cls(
mode=mode if mode is not None else "both",
at_hour=at_hour if at_hour is not None else 4,
idle_minutes=idle_minutes if idle_minutes is not None else 1440,
notify=_coerce_bool(notify, True),
notify_exclude_platforms=tuple(exclude) if exclude is not None else ("api_server", "webhook"),
bg_process_max_age_hours=bg_max_age if bg_max_age is not None else 24,
)

View file

@ -2661,13 +2661,21 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
self._fallback_model = self._load_fallback_model()
# Wire process registry into session store for reset protection.
# Processes older than MAX_ACTIVE_PROCESS_AGE (24h) are treated as
# stale and no longer block session idle / daily reset — see #29177.
from tools.process_registry import MAX_ACTIVE_PROCESS_AGE, process_registry
# A background process older than the configured threshold (default 24h,
# session_reset.bg_process_max_age_hours) is treated as stale and no
# longer blocks session idle / daily reset — see #29177. The process is
# NOT killed, only ignored by the reset guard.
from tools.process_registry import process_registry
_bg_max_age_hours = getattr(
self.config.default_reset_policy, "bg_process_max_age_hours", 24
)
_bg_max_age_seconds = (
_bg_max_age_hours * 3600 if _bg_max_age_hours and _bg_max_age_hours > 0 else None
)
self.session_store = SessionStore(
self.config.sessions_dir, self.config,
has_active_processes_fn=lambda key: process_registry.has_active_for_session(
key, max_active_age=MAX_ACTIVE_PROCESS_AGE,
key, max_active_age=_bg_max_age_seconds,
),
)
self.delivery_router = DeliveryRouter(self.config)

View file

@ -138,26 +138,31 @@ class TestGetConnectedPlatforms:
class TestSessionResetPolicy:
def test_roundtrip(self):
policy = SessionResetPolicy(mode="idle", at_hour=6, idle_minutes=120)
policy = SessionResetPolicy(mode="idle", at_hour=6, idle_minutes=120,
bg_process_max_age_hours=48)
d = policy.to_dict()
restored = SessionResetPolicy.from_dict(d)
assert restored.mode == "idle"
assert restored.at_hour == 6
assert restored.idle_minutes == 120
assert restored.bg_process_max_age_hours == 48
def test_defaults(self):
    """A policy built with no arguments exposes the expected default values."""
    expected_defaults = {
        "mode": "both",
        "at_hour": 4,
        "idle_minutes": 1440,
        "bg_process_max_age_hours": 24,
    }
    policy = SessionResetPolicy()
    # Check each field in the same order the defaults are listed above.
    for field_name, expected in expected_defaults.items():
        assert getattr(policy, field_name) == expected
def test_from_dict_treats_null_values_as_defaults(self):
restored = SessionResetPolicy.from_dict(
{"mode": None, "at_hour": None, "idle_minutes": None}
{"mode": None, "at_hour": None, "idle_minutes": None,
"bg_process_max_age_hours": None}
)
assert restored.mode == "both"
assert restored.at_hour == 4
assert restored.idle_minutes == 1440
assert restored.bg_process_max_age_hours == 24
def test_from_dict_coerces_quoted_false_notify(self):
restored = SessionResetPolicy.from_dict({"notify": "false"})

View file

@ -416,6 +416,34 @@ class TestListSessions:
assert len(result) == 1
assert result[0]["session_id"] == "proc_1"
def test_session_key_surfaces_cross_task_processes(self, registry):
    """Passing session_key surfaces same-session processes from other tasks.

    A process registered under the same gateway session but a DIFFERENT
    task must appear in the listing and carry ``session_scoped`` (#29177);
    a process under an unrelated gateway session must not appear.
    """
    # (process id, task id, gateway session key)
    #   proc_own       -> the current turn's task
    #   proc_forgotten -> older task, same gateway session (the blocker)
    #   proc_other     -> unrelated gateway session
    fixtures = [
        ("proc_own", "t_now", "gw1"),
        ("proc_forgotten", "t_old", "gw1"),
        ("proc_other", "t_x", "gw_other"),
    ]
    sessions = []
    for sid, task_id, session_key in fixtures:
        session = _make_session(sid=sid, task_id=task_id)
        session.session_key = session_key
        sessions.append(session)
    for session in sessions:
        registry._running[session.id] = session

    # Task-only (legacy) view: just the current task's own process.
    task_only_ids = {
        row["session_id"] for row in registry.list_sessions(task_id="t_now")
    }
    assert task_only_ids == {"proc_own"}

    # Adding session_key pulls in the forgotten same-session process and
    # flags it, while the unrelated session stays hidden.
    rows = {
        row["session_id"]: row
        for row in registry.list_sessions(task_id="t_now", session_key="gw1")
    }
    assert set(rows) == {"proc_own", "proc_forgotten"}
    assert rows["proc_forgotten"].get("session_scoped") is True
    assert "session_scoped" not in rows["proc_own"]
def test_list_entry_fields(self, registry):
s = _make_session(output="preview text")
registry._running[s.id] = s

View file

@ -58,7 +58,7 @@ CHECKPOINT_PATH = get_hermes_home() / "processes.json"
MAX_OUTPUT_CHARS = 200_000 # 200KB rolling output buffer
FINISHED_TTL_SECONDS = 1800 # Keep finished processes for 30 minutes
MAX_PROCESSES = 64 # Max concurrent tracked processes (LRU pruning)
MAX_ACTIVE_PROCESS_AGE = 86400 # 24h — stale processes no longer block session reset
MAX_ACTIVE_PROCESS_AGE = 86400 # 24h default — see session_reset.bg_process_max_age_hours (#29177)
# Watch pattern rate limiting — PER SESSION.
# Hard rule: at most ONE watch-match notification every WATCH_MIN_INTERVAL_SECONDS.
@ -1515,15 +1515,28 @@ class ProcessRegistry:
except Exception:
return 0
def list_sessions(self, task_id: str = None) -> list:
"""List all running and recently-finished processes."""
def list_sessions(self, task_id: str = None, session_key: str = None) -> list:
"""List all running and recently-finished processes.
When ``task_id`` is given, processes for that task are included. When
``session_key`` is also given, session-scoped background processes
(``background: true``) registered under that gateway session are
surfaced too, even if they belong to a different task so the agent
can discover a forgotten preview server that is blocking session
reset (#29177). Such cross-task entries are flagged with
``"session_scoped": true``.
"""
with self._lock:
all_sessions = list(self._running.values()) + list(self._finished.values())
all_sessions = [self._refresh_detached_session(s) for s in all_sessions]
if task_id:
all_sessions = [s for s in all_sessions if s.task_id == task_id]
if task_id or session_key:
all_sessions = [
s for s in all_sessions
if (task_id and s.task_id == task_id)
or (session_key and s.session_key == session_key)
]
result = []
for s in all_sessions:
@ -1537,6 +1550,11 @@ class ProcessRegistry:
"status": "exited" if s.exited else "running",
"output_preview": s.output_buffer[-200:] if s.output_buffer else "",
}
# Flag processes surfaced only because they share the gateway
# session (not the current task) — these are the long-lived
# background processes a user may have forgotten about (#29177).
if task_id and session_key and s.task_id != task_id and s.session_key == session_key:
entry["session_scoped"] = True
# Trigger metadata so a goal-loop judge can decide to wait on this
# process's OWN signal (a watch-pattern match or completion), not
# just its exit. A watcher with watch_patterns may never exit.
@ -2070,7 +2088,18 @@ def _handle_process(args, **kw):
session_id = str(args.get("session_id", "")) if args.get("session_id") is not None else ""
if action == "list":
return json.dumps({"processes": process_registry.list_sessions(task_id=task_id)}, ensure_ascii=False)
# Surface session-scoped background processes (e.g. a forgotten
# preview server) in addition to this task's own — they share the
# gateway session_key and can block session reset (#29177).
try:
from tools.approval import get_current_session_key
session_key = get_current_session_key(default="") or ""
except Exception:
session_key = ""
return json.dumps(
{"processes": process_registry.list_sessions(task_id=task_id, session_key=session_key or None)},
ensure_ascii=False,
)
elif action in {"poll", "log", "wait", "kill", "write", "submit", "close"}:
if not session_id:
return tool_error(f"session_id is required for {action}")

View file

@ -198,6 +198,15 @@ Sessions reset based on configurable policies:
| Idle | 1440 min | Reset after N minutes of inactivity |
| Both | (combined) | Whichever triggers first |
A live background process (started with `terminal(background=true)`) normally
protects its session from resetting so output isn't lost. To stop a forgotten
process — say a preview server — from pinning a session open forever, a
background process older than `session_reset.bg_process_max_age_hours` (set in
`config.yaml`; default **24**) no longer blocks reset. The process is **not**
killed, only ignored by the reset guard. Set it to `0` to disable the cutoff
(any live process blocks reset, the old behavior), or raise it if you run
legitimate multi-day jobs whose liveness should keep the conversation open.
Configure per-platform overrides in `~/.hermes/gateway.json`:
```json