Reconnect dashboard PTY chat after socket drops

This commit is contained in:
Shannon Sands 2026-06-26 16:15:22 +10:00 committed by Teknium
parent 6a319f570f
commit 41f8126148
3 changed files with 220 additions and 11 deletions

View file

@ -167,6 +167,7 @@ def _resolve_restart_drain_timeout() -> float:
async def _lifespan(app: "FastAPI"):
app.state.event_channels = {} # dict[str, set]
app.state.event_lock = asyncio.Lock()
app.state.pty_active_session_files = {} # dict[str, Path]
# Serializes chat-argv resolution so concurrent /api/pty connections
# don't trigger overlapping ``npm install`` / ``npm run build`` work.
# On app.state (not a module global) so the Lock binds to the running
@ -234,6 +235,15 @@ def _get_chat_argv_lock(app: "FastAPI") -> asyncio.Lock:
return app.state.chat_argv_lock
def _get_pty_active_session_files(app: "FastAPI") -> dict[str, Path]:
    """Return the channel -> active-session-file mapping for dashboard PTYs.

    The mapping lives on ``app.state``. If the attribute has not been set
    yet, an empty dict is installed there first so callers always receive
    the same shared, mutable mapping.
    """
    state = app.state
    # hasattr() swallows the AttributeError raised for a missing attribute,
    # which is exactly the "not initialised yet" case.
    if not hasattr(state, "pty_active_session_files"):
        state.pty_active_session_files = {}
    return state.pty_active_session_files
# ASGI application object; `_lifespan` is registered as its lifespan handler.
app = FastAPI(title="Hermes Agent", version=__version__, lifespan=_lifespan)
# Memory-provider OAuth connect routes live in the memory layer, not here.
@ -11544,6 +11554,7 @@ def _resolve_chat_argv(
resume: Optional[str] = None,
sidecar_url: Optional[str] = None,
profile: Optional[str] = None,
active_session_file: Optional[str] = None,
) -> tuple[list[str], Optional[str], Optional[dict]]:
"""Resolve the argv + cwd + env for the chat PTY.
@ -11564,6 +11575,12 @@ def _resolve_chat_argv(
the spawned ``tui_gateway.entry`` can mirror dispatcher emits to the
dashboard's ``/api/pub`` endpoint (see :func:`pub_ws`).
`active_session_file` (when set) is forwarded as
``HERMES_TUI_ACTIVE_SESSION_FILE``. The TUI writes the current session id
there whenever it creates/resumes/switches sessions, giving the dashboard a
small cross-process breadcrumb for reconnecting after an unexpected browser
WebSocket close.
`profile` (when set) scopes the ENTIRE chat to that profile by pointing
``HERMES_HOME`` at the profile dir in the child env. Every spawned
process (the TUI and the ``tui_gateway.entry`` it launches) resolves
@ -11611,6 +11628,9 @@ def _resolve_chat_argv(
if sidecar_url:
env["HERMES_TUI_SIDECAR_URL"] = sidecar_url
if active_session_file:
env["HERMES_TUI_ACTIVE_SESSION_FILE"] = active_session_file
# Profile-scoped chats must NOT attach to the dashboard's in-memory
# gateway — it runs under the dashboard's own profile. Without the
# attach URL, gatewayClient spawns its own `tui_gateway.entry`, which
@ -11659,6 +11679,7 @@ async def _resolve_chat_argv_async(
resume: Optional[str] = None,
sidecar_url: Optional[str] = None,
profile: Optional[str] = None,
active_session_file: Optional[str] = None,
) -> tuple[list[str], Optional[str], Optional[dict]]:
"""Resolve chat argv without blocking the dashboard event loop.
@ -11670,12 +11691,18 @@ async def _resolve_chat_argv_async(
multiple browser tabs connect at once without occupying worker threads
while queued connections wait.
"""
kwargs = {
"resume": resume,
"sidecar_url": sidecar_url,
"profile": profile,
}
if active_session_file is not None:
kwargs["active_session_file"] = active_session_file
async with _get_chat_argv_lock(app):
return await asyncio.to_thread(
_resolve_chat_argv,
resume=resume,
sidecar_url=sidecar_url,
profile=profile,
**kwargs,
)
@ -11737,6 +11764,37 @@ def _channel_or_close_code(ws: WebSocket) -> Optional[str]:
return channel if _VALID_CHANNEL_RE.match(channel) else None
def _active_session_file_for_channel(app: "FastAPI", channel: str) -> Path:
    """Return the per-channel file where a dashboard TUI writes its active sid.

    The first request for a channel creates an empty temp file (prefix
    ``hermes-pty-active-``, suffix ``.json``) and records its path in the
    app-level mapping; later requests for the same channel return that
    recorded path.
    """
    registry = _get_pty_active_session_files(app)
    known = registry.get(channel)
    if known is None:
        handle, location = tempfile.mkstemp(
            prefix="hermes-pty-active-", suffix=".json"
        )
        # Only the path is needed here; release the descriptor right away.
        os.close(handle)
        known = registry[channel] = Path(location)
    return known
def _read_active_session_file(path: Path) -> Optional[str]:
    """Return the session id recorded in *path*, or ``None`` if there is none.

    The file is read as UTF-8 JSON and is expected to hold an object with a
    ``session_id`` key. Reading is best-effort: a missing or unreadable file,
    undecodable bytes, malformed JSON, a non-object payload, or an
    empty/blank ``session_id`` all yield ``None`` instead of raising.

    Args:
        path: Location of the active-session breadcrumb file.

    Returns:
        The stripped session id as a string, or ``None``.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError is the common parent of json.JSONDecodeError and
        # UnicodeDecodeError, so corrupt bytes are handled like corrupt JSON.
        return None
    # Valid JSON need not be an object (e.g. ``[]`` or ``"x"``); calling
    # ``.get`` on such a payload would raise AttributeError.
    if not isinstance(data, dict):
        return None
    session_id = str(data.get("session_id") or "").strip()
    return session_id or None
def _forget_active_session_file(path: Path) -> None:
    """Delete the active-session breadcrumb at *path*, ignoring failures.

    A file that is already gone is not an error, and any other ``OSError``
    raised by the unlink is swallowed so the caller never sees it.
    """
    try:
        path.unlink(missing_ok=True)
    except OSError:
        # Deliberately swallowed: removal is best-effort.
        return
def _ws_close_reason(text: str) -> str:
"""Clamp a WS close reason to the protocol's 123-byte UTF-8 limit.
@ -11807,11 +11865,32 @@ async def pty_ws(ws: WebSocket) -> None:
profile = ws.query_params.get("profile") or None
channel = _channel_or_close_code(ws)
sidecar_url = _build_sidecar_url(channel) if channel else None
force_fresh = (ws.query_params.get("fresh") or "").strip().lower() in {
"1",
"true",
"yes",
"on",
}
active_session_file: Optional[Path] = None
if channel:
active_session_file = _active_session_file_for_channel(ws.app, channel)
if force_fresh:
resume = None
_forget_active_session_file(active_session_file)
elif not resume:
resume = _read_active_session_file(active_session_file)
resolve_kwargs = {
"resume": resume,
"sidecar_url": sidecar_url,
"profile": profile,
}
if active_session_file is not None:
resolve_kwargs["active_session_file"] = str(active_session_file)
try:
argv, cwd, env = await _resolve_chat_argv_async(
resume=resume, sidecar_url=sidecar_url, profile=profile
)
argv, cwd, env = await _resolve_chat_argv_async(**resolve_kwargs)
except HTTPException as exc:
# Unknown/invalid profile from _resolve_profile_dir.
await ws.send_text(f"\r\n\x1b[31mChat unavailable: {exc.detail}\x1b[0m\r\n")