fix(update): survive mid-update terminal disconnect (#11960)

hermes update no longer dies when the controlling terminal closes
(SSH drop, shell close) during pip install.  SIGHUP is set to SIG_IGN
for the duration of the update, and stdout/stderr are wrapped so writes
to a closed pipe are absorbed instead of cascading into process exit.
All update output is mirrored to ~/.hermes/logs/update.log so users can
see what happened after reconnecting.

SIGINT (Ctrl-C) and SIGTERM (systemd) are intentionally still honored —
those are deliberate cancellations, not accidents.  In gateway mode the
helper is a no-op since the update is already detached.

POSIX preserves SIG_IGN across exec(), so pip and git subprocesses
inherit hangup protection automatically — no changes to subprocess
spawning needed.
This commit is contained in:
Teknium 2026-04-17 21:29:24 -07:00 committed by GitHub
parent 1c352f6b1d
commit 8a59f8a9ed
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 534 additions and 1 deletions

View file

@ -4985,8 +4985,187 @@ def _update_node_dependencies() -> None:
print(f" {stderr.splitlines()[-1]}")
class _UpdateOutputStream:
"""Stream wrapper used during ``hermes update`` to survive terminal loss.
Wraps the process's original stdout/stderr so that:
* Every write is also mirrored to an append-only log file
(``~/.hermes/logs/update.log``) that users can inspect after the
terminal disconnects.
* Writes to the original stream that fail with ``BrokenPipeError`` /
``OSError`` / ``ValueError`` (closed file) no longer cascade into
process exit the update keeps going, only the on-screen output
stops.
Combined with ``SIGHUP -> SIG_IGN`` installed by
``_install_hangup_protection``, this makes ``hermes update`` safe to
run in a plain SSH session that might disconnect mid-install.
"""
def __init__(self, original, log_file):
self._original = original
self._log = log_file
self._original_broken = False
def write(self, data):
# Mirror to the log file first — it's the most reliable destination.
if self._log is not None:
try:
self._log.write(data)
except Exception:
# Log errors should never abort the update.
pass
if self._original_broken:
return len(data) if isinstance(data, (str, bytes)) else 0
try:
return self._original.write(data)
except (BrokenPipeError, OSError, ValueError):
# Terminal vanished (SSH disconnect, shell close). Stop trying
# to write to it, but keep the update running.
self._original_broken = True
return len(data) if isinstance(data, (str, bytes)) else 0
def flush(self):
if self._log is not None:
try:
self._log.flush()
except Exception:
pass
if self._original_broken:
return
try:
self._original.flush()
except (BrokenPipeError, OSError, ValueError):
self._original_broken = True
def isatty(self):
if self._original_broken:
return False
try:
return self._original.isatty()
except Exception:
return False
def fileno(self):
# Some tools probe fileno(); defer to the underlying stream and let
# callers handle failures (same behaviour as the unwrapped stream).
return self._original.fileno()
def __getattr__(self, name):
return getattr(self._original, name)
def _install_hangup_protection(gateway_mode: bool = False):
"""Protect ``cmd_update`` from SIGHUP and broken terminal pipes.
Users commonly run ``hermes update`` in an SSH session or a terminal
that may close mid-install. Without protection, ``SIGHUP`` from the
terminal kills the Python process during ``pip install`` and leaves
the venv half-installed; the documented workaround ("use screen /
tmux") shouldn't be required for something as routine as an update.
Protections installed:
1. ``SIGHUP`` is set to ``SIG_IGN``. POSIX preserves ``SIG_IGN``
across ``exec()``, so pip and git subprocesses also stop dying on
hangup.
2. ``sys.stdout`` / ``sys.stderr`` are wrapped to mirror output to
``~/.hermes/logs/update.log`` and to silently absorb
``BrokenPipeError`` when the terminal vanishes.
``SIGINT`` (Ctrl-C) and ``SIGTERM`` (systemd shutdown) are
**intentionally left alone** those are legitimate cancellation
signals the user or OS sent on purpose.
In gateway mode (``hermes update --gateway``) the update is already
spawned detached from a terminal, so this function is a no-op.
Returns a dict that ``cmd_update`` can pass to
``_finalize_update_output`` on exit. Returning a dict rather than a
tuple keeps the call site forward-compatible with future additions.
"""
state = {
"prev_stdout": sys.stdout,
"prev_stderr": sys.stderr,
"log_file": None,
"installed": False,
}
if gateway_mode:
return state
import signal as _signal
# (1) Ignore SIGHUP for the remainder of this process.
if hasattr(_signal, "SIGHUP"):
try:
_signal.signal(_signal.SIGHUP, _signal.SIG_IGN)
except (ValueError, OSError):
# Called from a non-main thread — not fatal. The update still
# runs, just without hangup protection.
pass
# (2) Mirror output to update.log and wrap stdio for broken-pipe
# tolerance. Any failure here is non-fatal; we just skip the wrap.
try:
from hermes_cli.config import get_hermes_home
logs_dir = get_hermes_home() / "logs"
logs_dir.mkdir(parents=True, exist_ok=True)
log_path = logs_dir / "update.log"
log_file = open(log_path, "a", buffering=1, encoding="utf-8")
import datetime as _dt
log_file.write(
f"\n=== hermes update started "
f"{_dt.datetime.now().isoformat(timespec='seconds')} ===\n"
)
state["log_file"] = log_file
sys.stdout = _UpdateOutputStream(state["prev_stdout"], log_file)
sys.stderr = _UpdateOutputStream(state["prev_stderr"], log_file)
state["installed"] = True
except Exception:
# Leave stdio untouched on any setup failure. Update continues
# without mirroring.
state["log_file"] = None
return state
def _finalize_update_output(state):
"""Restore stdio and close the update.log handle opened by ``_install_hangup_protection``."""
if not state:
return
if state.get("installed"):
try:
sys.stdout = state.get("prev_stdout", sys.stdout)
except Exception:
pass
try:
sys.stderr = state.get("prev_stderr", sys.stderr)
except Exception:
pass
log_file = state.get("log_file")
if log_file is not None:
try:
log_file.flush()
log_file.close()
except Exception:
pass
def cmd_update(args):
"""Update Hermes Agent to the latest version."""
"""Update Hermes Agent to the latest version.
Thin wrapper around ``_cmd_update_impl``: installs hangup protection,
runs the update, then restores stdio on the way out (even on
``sys.exit`` or unhandled exceptions).
"""
from hermes_cli.config import is_managed, managed_error
if is_managed():
@ -4994,6 +5173,20 @@ def cmd_update(args):
return
gateway_mode = getattr(args, "gateway", False)
# Protect against mid-update terminal disconnects (SIGHUP) and tolerate
# writes to a closed stdout. No-op in gateway mode. See
# _install_hangup_protection for rationale.
_update_io_state = _install_hangup_protection(gateway_mode=gateway_mode)
try:
_cmd_update_impl(args, gateway_mode=gateway_mode)
finally:
_finalize_update_output(_update_io_state)
def _cmd_update_impl(args, gateway_mode: bool):
"""Body of ``cmd_update`` — kept separate so the wrapper can always
restore stdio even on ``sys.exit``."""
# In gateway mode, use file-based IPC for prompts instead of stdin
gw_input_fn = (
(lambda prompt, default="": _gateway_prompt(prompt, default))

View file

@ -0,0 +1,325 @@
"""Tests for SIGHUP protection and stdout mirroring in ``hermes update``.
Covers ``_UpdateOutputStream``, ``_install_hangup_protection``, and
``_finalize_update_output`` in ``hermes_cli/main.py``. These exist so
that ``hermes update`` survives a terminal disconnect mid-install
(SSH drop, shell close) without leaving the venv half-installed.
"""
from __future__ import annotations
import io
import os
import signal
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
from hermes_cli.main import (
_UpdateOutputStream,
_finalize_update_output,
_install_hangup_protection,
)
# -----------------------------------------------------------------------------
# _UpdateOutputStream
# -----------------------------------------------------------------------------
class TestUpdateOutputStream:
def test_write_mirrors_to_both_original_and_log(self):
original = io.StringIO()
log = io.StringIO()
stream = _UpdateOutputStream(original, log)
stream.write("hello world\n")
assert original.getvalue() == "hello world\n"
assert log.getvalue() == "hello world\n"
def test_write_continues_after_broken_original(self):
"""When the terminal disconnects, original.write raises BrokenPipeError.
The wrapper must catch it, flip the broken flag, and keep writing to
the log from then on.
"""
log = io.StringIO()
class _BrokenStream:
def write(self, data):
raise BrokenPipeError("terminal gone")
def flush(self):
raise BrokenPipeError("terminal gone")
stream = _UpdateOutputStream(_BrokenStream(), log)
# First write triggers the broken-pipe path.
stream.write("first line\n")
# Subsequent writes take the fast broken path (no exception).
stream.write("second line\n")
assert log.getvalue() == "first line\nsecond line\n"
assert stream._original_broken is True
def test_write_tolerates_oserror_and_valueerror(self):
"""OSError (EIO) and ValueError (closed file) should also be absorbed."""
log = io.StringIO()
class _RaisingStream:
def __init__(self, exc):
self._exc = exc
def write(self, data):
raise self._exc
def flush(self):
raise self._exc
for exc in (OSError("EIO"), ValueError("closed file")):
stream = _UpdateOutputStream(_RaisingStream(exc), log)
stream.write("x\n")
assert stream._original_broken is True
def test_log_failure_does_not_abort_write(self):
"""Even if the log file write raises, the original write must still happen."""
class _BrokenLog:
def write(self, data):
raise OSError("disk full")
def flush(self):
raise OSError("disk full")
original = io.StringIO()
stream = _UpdateOutputStream(original, _BrokenLog())
stream.write("data\n")
assert original.getvalue() == "data\n"
def test_flush_tolerates_broken_original(self):
class _BrokenStream:
def write(self, data):
return len(data)
def flush(self):
raise BrokenPipeError("gone")
log = io.StringIO()
stream = _UpdateOutputStream(_BrokenStream(), log)
stream.flush() # must not raise
assert stream._original_broken is True
def test_isatty_delegates_to_original(self):
class _TtyStream:
def isatty(self):
return True
def write(self, data):
return len(data)
def flush(self):
return None
stream = _UpdateOutputStream(_TtyStream(), io.StringIO())
assert stream.isatty() is True
def test_isatty_returns_false_after_broken(self):
class _BrokenStream:
def isatty(self):
return True
def write(self, data):
raise BrokenPipeError()
def flush(self):
return None
stream = _UpdateOutputStream(_BrokenStream(), io.StringIO())
stream.write("x") # marks broken
assert stream.isatty() is False
def test_getattr_delegates_unknown_attrs(self):
class _StreamWithEncoding:
encoding = "utf-8"
def write(self, data):
return len(data)
def flush(self):
return None
stream = _UpdateOutputStream(_StreamWithEncoding(), io.StringIO())
assert stream.encoding == "utf-8"
# -----------------------------------------------------------------------------
# _install_hangup_protection
# -----------------------------------------------------------------------------
class TestInstallHangupProtection:
def test_gateway_mode_is_noop(self):
"""In gateway mode the process is already detached — don't touch stdio or signals."""
prev_out, prev_err = sys.stdout, sys.stderr
prev_sighup = signal.getsignal(signal.SIGHUP) if hasattr(signal, "SIGHUP") else None
state = _install_hangup_protection(gateway_mode=True)
try:
assert sys.stdout is prev_out
assert sys.stderr is prev_err
assert state["log_file"] is None
assert state["installed"] is False
if hasattr(signal, "SIGHUP"):
assert signal.getsignal(signal.SIGHUP) == prev_sighup
finally:
_finalize_update_output(state)
@pytest.mark.skipif(
not hasattr(signal, "SIGHUP"), reason="SIGHUP not available on this platform"
)
def test_installs_sighup_ignore(self, tmp_path, monkeypatch):
"""SIGHUP should be set to SIG_IGN so SSH disconnect doesn't kill the update."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# Clear cached get_hermes_home if present
import hermes_cli.config as _cfg
if hasattr(_cfg, "_HERMES_HOME_CACHE"):
_cfg._HERMES_HOME_CACHE = None # type: ignore[attr-defined]
original_handler = signal.getsignal(signal.SIGHUP)
state = _install_hangup_protection(gateway_mode=False)
try:
assert signal.getsignal(signal.SIGHUP) == signal.SIG_IGN
finally:
_finalize_update_output(state)
# Restore whatever was there before so we don't leak to other tests.
signal.signal(signal.SIGHUP, original_handler)
def test_wraps_stdout_and_stderr_with_mirror(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
# Nuke any cached home path
import hermes_cli.config as _cfg
if hasattr(_cfg, "_HERMES_HOME_CACHE"):
_cfg._HERMES_HOME_CACHE = None # type: ignore[attr-defined]
prev_out, prev_err = sys.stdout, sys.stderr
state = _install_hangup_protection(gateway_mode=False)
try:
# On Windows (no SIGHUP) we still wrap stdio and create the log.
assert state["installed"] is True
assert isinstance(sys.stdout, _UpdateOutputStream)
assert isinstance(sys.stderr, _UpdateOutputStream)
assert state["log_file"] is not None
sys.stdout.write("checking mirror\n")
sys.stdout.flush()
log_path = tmp_path / "logs" / "update.log"
assert log_path.exists()
contents = log_path.read_text(encoding="utf-8")
assert "checking mirror" in contents
assert "hermes update started" in contents
finally:
_finalize_update_output(state)
# Sanity-check restoration
assert sys.stdout is prev_out
assert sys.stderr is prev_err
def test_logs_dir_created_if_missing(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
import hermes_cli.config as _cfg
if hasattr(_cfg, "_HERMES_HOME_CACHE"):
_cfg._HERMES_HOME_CACHE = None # type: ignore[attr-defined]
# No logs/ dir yet.
assert not (tmp_path / "logs").exists()
state = _install_hangup_protection(gateway_mode=False)
try:
assert (tmp_path / "logs").is_dir()
assert (tmp_path / "logs" / "update.log").exists()
finally:
_finalize_update_output(state)
def test_non_fatal_if_log_setup_fails(self, monkeypatch):
"""If get_hermes_home() raises, stdio must be left untouched but SIGHUP still handled."""
prev_out, prev_err = sys.stdout, sys.stderr
def _boom():
raise RuntimeError("no home for you")
# Patch the import inside _install_hangup_protection.
monkeypatch.setattr(
"hermes_cli.config.get_hermes_home", _boom, raising=True
)
original_handler = (
signal.getsignal(signal.SIGHUP) if hasattr(signal, "SIGHUP") else None
)
state = _install_hangup_protection(gateway_mode=False)
try:
assert sys.stdout is prev_out
assert sys.stderr is prev_err
assert state["installed"] is False
# SIGHUP must still be installed even when log setup fails.
if hasattr(signal, "SIGHUP"):
assert signal.getsignal(signal.SIGHUP) == signal.SIG_IGN
finally:
_finalize_update_output(state)
if hasattr(signal, "SIGHUP") and original_handler is not None:
signal.signal(signal.SIGHUP, original_handler)
# -----------------------------------------------------------------------------
# _finalize_update_output
# -----------------------------------------------------------------------------
class TestFinalizeUpdateOutput:
def test_none_state_is_noop(self):
_finalize_update_output(None) # must not raise
def test_restores_streams_and_closes_log(self, tmp_path, monkeypatch):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
import hermes_cli.config as _cfg
if hasattr(_cfg, "_HERMES_HOME_CACHE"):
_cfg._HERMES_HOME_CACHE = None # type: ignore[attr-defined]
prev_out = sys.stdout
state = _install_hangup_protection(gateway_mode=False)
log_file = state["log_file"]
assert sys.stdout is not prev_out
assert log_file is not None
_finalize_update_output(state)
assert sys.stdout is prev_out
# The log file handle should be closed.
assert log_file.closed is True
def test_skipped_install_leaves_stdio_alone(self):
"""When install failed (state['installed']=False) finalize should not
touch sys.stdout / sys.stderr (they were never wrapped)."""
# Build a synthetic state that mimics a failed install.
sentinel_out = object()
state = {
"prev_stdout": sentinel_out,
"prev_stderr": sentinel_out,
"log_file": None,
"installed": False,
}
before_out, before_err = sys.stdout, sys.stderr
_finalize_update_output(state)
assert sys.stdout is before_out
assert sys.stderr is before_err

View file

@ -59,6 +59,21 @@ Already up to date. (or: Updating abc1234..def5678)
If `git status --short` shows unexpected changes after `hermes update`, stop and inspect them before continuing. This usually means local modifications were reapplied on top of the updated code, or a dependency step refreshed lockfiles.
:::
### If your terminal disconnects mid-update
`hermes update` protects itself against accidental terminal loss:
- The update ignores `SIGHUP`, so closing your SSH session or terminal window no longer kills it mid-install. `pip` and `git` child processes inherit this protection, so the Python environment cannot be left half-installed by a dropped connection.
- All output is mirrored to `~/.hermes/logs/update.log` while the update runs. If your terminal disappears, reconnect and inspect the log to see whether the update finished and whether the gateway restart succeeded:
```bash
tail -f ~/.hermes/logs/update.log
```
- `Ctrl-C` (SIGINT) and system shutdown (SIGTERM) are still honored — those are deliberate cancellations, not accidents.
You no longer need to wrap `hermes update` in `screen` or `tmux` to survive a terminal drop.
### Checking your current version
```bash