From 8a59f8a9edcf6a23cffda3377cced2761732e7bc Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Fri, 17 Apr 2026 21:29:24 -0700 Subject: [PATCH] fix(update): survive mid-update terminal disconnect (#11960) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hermes update no longer dies when the controlling terminal closes (SSH drop, shell close) during pip install. SIGHUP is set to SIG_IGN for the duration of the update, and stdout/stderr are wrapped so writes to a closed pipe are absorbed instead of cascading into process exit. All update output is mirrored to ~/.hermes/logs/update.log so users can see what happened after reconnecting. SIGINT (Ctrl-C) and SIGTERM (systemd) are intentionally still honored — those are deliberate cancellations, not accidents. In gateway mode the helper is a no-op since the update is already detached. POSIX preserves SIG_IGN across exec(), so pip and git subprocesses inherit hangup protection automatically — no changes to subprocess spawning needed. --- hermes_cli/main.py | 195 ++++++++++- .../test_update_hangup_protection.py | 325 ++++++++++++++++++ website/docs/getting-started/updating.md | 15 + 3 files changed, 534 insertions(+), 1 deletion(-) create mode 100644 tests/hermes_cli/test_update_hangup_protection.py diff --git a/hermes_cli/main.py b/hermes_cli/main.py index 81b27e4a1..0afadac3d 100644 --- a/hermes_cli/main.py +++ b/hermes_cli/main.py @@ -4985,8 +4985,187 @@ def _update_node_dependencies() -> None: print(f" {stderr.splitlines()[-1]}") +class _UpdateOutputStream: + """Stream wrapper used during ``hermes update`` to survive terminal loss. + + Wraps the process's original stdout/stderr so that: + + * Every write is also mirrored to an append-only log file + (``~/.hermes/logs/update.log``) that users can inspect after the + terminal disconnects. + * Writes to the original stream that fail with ``BrokenPipeError`` / + ``OSError`` / ``ValueError`` (closed file) no longer cascade into + process exit — the update keeps going, only the on-screen output + stops. + + Combined with ``SIGHUP -> SIG_IGN`` installed by + ``_install_hangup_protection``, this makes ``hermes update`` safe to + run in a plain SSH session that might disconnect mid-install. + """ + + def __init__(self, original, log_file): + self._original = original + self._log = log_file + self._original_broken = False + + def write(self, data): + # Mirror to the log file first — it's the most reliable destination. + if self._log is not None: + try: + self._log.write(data) + except Exception: + # Log errors should never abort the update. + pass + + if self._original_broken: + return len(data) if isinstance(data, (str, bytes)) else 0 + + try: + return self._original.write(data) + except (BrokenPipeError, OSError, ValueError): + # Terminal vanished (SSH disconnect, shell close). Stop trying + # to write to it, but keep the update running. + self._original_broken = True + return len(data) if isinstance(data, (str, bytes)) else 0 + + def flush(self): + if self._log is not None: + try: + self._log.flush() + except Exception: + pass + if self._original_broken: + return + try: + self._original.flush() + except (BrokenPipeError, OSError, ValueError): + self._original_broken = True + + def isatty(self): + if self._original_broken: + return False + try: + return self._original.isatty() + except Exception: + return False + + def fileno(self): + # Some tools probe fileno(); defer to the underlying stream and let + # callers handle failures (same behaviour as the unwrapped stream). + return self._original.fileno() + + def __getattr__(self, name): + return getattr(self._original, name) + + +def _install_hangup_protection(gateway_mode: bool = False): + """Protect ``cmd_update`` from SIGHUP and broken terminal pipes. + + Users commonly run ``hermes update`` in an SSH session or a terminal + that may close mid-install. Without protection, ``SIGHUP`` from the + terminal kills the Python process during ``pip install`` and leaves + the venv half-installed; the documented workaround ("use screen / + tmux") shouldn't be required for something as routine as an update. + + Protections installed: + + 1. ``SIGHUP`` is set to ``SIG_IGN``. POSIX preserves ``SIG_IGN`` + across ``exec()``, so pip and git subprocesses also stop dying on + hangup. + 2. ``sys.stdout`` / ``sys.stderr`` are wrapped to mirror output to + ``~/.hermes/logs/update.log`` and to silently absorb + ``BrokenPipeError`` when the terminal vanishes. + + ``SIGINT`` (Ctrl-C) and ``SIGTERM`` (systemd shutdown) are + **intentionally left alone** — those are legitimate cancellation + signals the user or OS sent on purpose. + + In gateway mode (``hermes update --gateway``) the update is already + spawned detached from a terminal, so this function is a no-op. + + Returns a dict that ``cmd_update`` can pass to + ``_finalize_update_output`` on exit. Returning a dict rather than a + tuple keeps the call site forward-compatible with future additions. + """ + state = { + "prev_stdout": sys.stdout, + "prev_stderr": sys.stderr, + "log_file": None, + "installed": False, + } + + if gateway_mode: + return state + + import signal as _signal + + # (1) Ignore SIGHUP for the remainder of this process. + if hasattr(_signal, "SIGHUP"): + try: + _signal.signal(_signal.SIGHUP, _signal.SIG_IGN) + except (ValueError, OSError): + # Called from a non-main thread — not fatal. The update still + # runs, just without hangup protection. + pass + + # (2) Mirror output to update.log and wrap stdio for broken-pipe + # tolerance. Any failure here is non-fatal; we just skip the wrap. + try: + from hermes_cli.config import get_hermes_home + + logs_dir = get_hermes_home() / "logs" + logs_dir.mkdir(parents=True, exist_ok=True) + log_path = logs_dir / "update.log" + log_file = open(log_path, "a", buffering=1, encoding="utf-8") + + import datetime as _dt + + log_file.write( + f"\n=== hermes update started " + f"{_dt.datetime.now().isoformat(timespec='seconds')} ===\n" + ) + + state["log_file"] = log_file + sys.stdout = _UpdateOutputStream(state["prev_stdout"], log_file) + sys.stderr = _UpdateOutputStream(state["prev_stderr"], log_file) + state["installed"] = True + except Exception: + # Leave stdio untouched on any setup failure. Update continues + # without mirroring. + state["log_file"] = None + + return state + + +def _finalize_update_output(state): + """Restore stdio and close the update.log handle opened by ``_install_hangup_protection``.""" + if not state: + return + if state.get("installed"): + try: + sys.stdout = state.get("prev_stdout", sys.stdout) + except Exception: + pass + try: + sys.stderr = state.get("prev_stderr", sys.stderr) + except Exception: + pass + log_file = state.get("log_file") + if log_file is not None: + try: + log_file.flush() + log_file.close() + except Exception: + pass + + def cmd_update(args): - """Update Hermes Agent to the latest version.""" + """Update Hermes Agent to the latest version. + + Thin wrapper around ``_cmd_update_impl``: installs hangup protection, + runs the update, then restores stdio on the way out (even on + ``sys.exit`` or unhandled exceptions). + """ from hermes_cli.config import is_managed, managed_error if is_managed(): @@ -4994,6 +5173,20 @@ def cmd_update(args): return gateway_mode = getattr(args, "gateway", False) + + # Protect against mid-update terminal disconnects (SIGHUP) and tolerate + # writes to a closed stdout. No-op in gateway mode. See + # _install_hangup_protection for rationale. + _update_io_state = _install_hangup_protection(gateway_mode=gateway_mode) + try: + _cmd_update_impl(args, gateway_mode=gateway_mode) + finally: + _finalize_update_output(_update_io_state) + + +def _cmd_update_impl(args, gateway_mode: bool): + """Body of ``cmd_update`` — kept separate so the wrapper can always + restore stdio even on ``sys.exit``.""" # In gateway mode, use file-based IPC for prompts instead of stdin gw_input_fn = ( (lambda prompt, default="": _gateway_prompt(prompt, default)) diff --git a/tests/hermes_cli/test_update_hangup_protection.py b/tests/hermes_cli/test_update_hangup_protection.py new file mode 100644 index 000000000..e5c81a45a --- /dev/null +++ b/tests/hermes_cli/test_update_hangup_protection.py @@ -0,0 +1,325 @@ +"""Tests for SIGHUP protection and stdout mirroring in ``hermes update``. + +Covers ``_UpdateOutputStream``, ``_install_hangup_protection``, and +``_finalize_update_output`` in ``hermes_cli/main.py``. These exist so +that ``hermes update`` survives a terminal disconnect mid-install +(SSH drop, shell close) without leaving the venv half-installed. +""" + +from __future__ import annotations + +import io +import os +import signal +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +from hermes_cli.main import ( + _UpdateOutputStream, + _finalize_update_output, + _install_hangup_protection, +) + + +# ----------------------------------------------------------------------------- +# _UpdateOutputStream +# ----------------------------------------------------------------------------- + + +class TestUpdateOutputStream: + def test_write_mirrors_to_both_original_and_log(self): + original = io.StringIO() + log = io.StringIO() + stream = _UpdateOutputStream(original, log) + + stream.write("hello world\n") + + assert original.getvalue() == "hello world\n" + assert log.getvalue() == "hello world\n" + + def test_write_continues_after_broken_original(self): + """When the terminal disconnects, original.write raises BrokenPipeError. + + The wrapper must catch it, flip the broken flag, and keep writing to + the log from then on. + """ + log = io.StringIO() + + class _BrokenStream: + def write(self, data): + raise BrokenPipeError("terminal gone") + + def flush(self): + raise BrokenPipeError("terminal gone") + + stream = _UpdateOutputStream(_BrokenStream(), log) + + # First write triggers the broken-pipe path. + stream.write("first line\n") + # Subsequent writes take the fast broken path (no exception). + stream.write("second line\n") + + assert log.getvalue() == "first line\nsecond line\n" + assert stream._original_broken is True + + def test_write_tolerates_oserror_and_valueerror(self): + """OSError (EIO) and ValueError (closed file) should also be absorbed.""" + log = io.StringIO() + + class _RaisingStream: + def __init__(self, exc): + self._exc = exc + + def write(self, data): + raise self._exc + + def flush(self): + raise self._exc + + for exc in (OSError("EIO"), ValueError("closed file")): + stream = _UpdateOutputStream(_RaisingStream(exc), log) + stream.write("x\n") + assert stream._original_broken is True + + def test_log_failure_does_not_abort_write(self): + """Even if the log file write raises, the original write must still happen.""" + class _BrokenLog: + def write(self, data): + raise OSError("disk full") + + def flush(self): + raise OSError("disk full") + + original = io.StringIO() + stream = _UpdateOutputStream(original, _BrokenLog()) + + stream.write("data\n") + + assert original.getvalue() == "data\n" + + def test_flush_tolerates_broken_original(self): + class _BrokenStream: + def write(self, data): + return len(data) + + def flush(self): + raise BrokenPipeError("gone") + + log = io.StringIO() + stream = _UpdateOutputStream(_BrokenStream(), log) + stream.flush() # must not raise + assert stream._original_broken is True + + def test_isatty_delegates_to_original(self): + class _TtyStream: + def isatty(self): + return True + + def write(self, data): + return len(data) + + def flush(self): + return None + + stream = _UpdateOutputStream(_TtyStream(), io.StringIO()) + assert stream.isatty() is True + + def test_isatty_returns_false_after_broken(self): + class _BrokenStream: + def isatty(self): + return True + + def write(self, data): + raise BrokenPipeError() + + def flush(self): + return None + + stream = _UpdateOutputStream(_BrokenStream(), io.StringIO()) + stream.write("x") # marks broken + assert stream.isatty() is False + + def test_getattr_delegates_unknown_attrs(self): + class _StreamWithEncoding: + encoding = "utf-8" + + def write(self, data): + return len(data) + + def flush(self): + return None + + stream = _UpdateOutputStream(_StreamWithEncoding(), io.StringIO()) + assert stream.encoding == "utf-8" + + +# ----------------------------------------------------------------------------- +# _install_hangup_protection +# ----------------------------------------------------------------------------- + + +class TestInstallHangupProtection: + def test_gateway_mode_is_noop(self): + """In gateway mode the process is already detached — don't touch stdio or signals.""" + prev_out, prev_err = sys.stdout, sys.stderr + prev_sighup = signal.getsignal(signal.SIGHUP) if hasattr(signal, "SIGHUP") else None + + state = _install_hangup_protection(gateway_mode=True) + + try: + assert sys.stdout is prev_out + assert sys.stderr is prev_err + assert state["log_file"] is None + assert state["installed"] is False + if hasattr(signal, "SIGHUP"): + assert signal.getsignal(signal.SIGHUP) == prev_sighup + finally: + _finalize_update_output(state) + + @pytest.mark.skipif( + not hasattr(signal, "SIGHUP"), reason="SIGHUP not available on this platform" + ) + def test_installs_sighup_ignore(self, tmp_path, monkeypatch): + """SIGHUP should be set to SIG_IGN so SSH disconnect doesn't kill the update.""" + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + # Clear cached get_hermes_home if present + import hermes_cli.config as _cfg + if hasattr(_cfg, "_HERMES_HOME_CACHE"): + _cfg._HERMES_HOME_CACHE = None # type: ignore[attr-defined] + + original_handler = signal.getsignal(signal.SIGHUP) + state = _install_hangup_protection(gateway_mode=False) + + try: + assert signal.getsignal(signal.SIGHUP) == signal.SIG_IGN + finally: + _finalize_update_output(state) + # Restore whatever was there before so we don't leak to other tests. + signal.signal(signal.SIGHUP, original_handler) + + def test_wraps_stdout_and_stderr_with_mirror(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + # Nuke any cached home path + import hermes_cli.config as _cfg + if hasattr(_cfg, "_HERMES_HOME_CACHE"): + _cfg._HERMES_HOME_CACHE = None # type: ignore[attr-defined] + + prev_out, prev_err = sys.stdout, sys.stderr + state = _install_hangup_protection(gateway_mode=False) + + try: + # On Windows (no SIGHUP) we still wrap stdio and create the log. + assert state["installed"] is True + assert isinstance(sys.stdout, _UpdateOutputStream) + assert isinstance(sys.stderr, _UpdateOutputStream) + assert state["log_file"] is not None + + sys.stdout.write("checking mirror\n") + sys.stdout.flush() + + log_path = tmp_path / "logs" / "update.log" + assert log_path.exists() + contents = log_path.read_text(encoding="utf-8") + assert "checking mirror" in contents + assert "hermes update started" in contents + finally: + _finalize_update_output(state) + # Sanity-check restoration + assert sys.stdout is prev_out + assert sys.stderr is prev_err + + def test_logs_dir_created_if_missing(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + import hermes_cli.config as _cfg + if hasattr(_cfg, "_HERMES_HOME_CACHE"): + _cfg._HERMES_HOME_CACHE = None # type: ignore[attr-defined] + + # No logs/ dir yet. + assert not (tmp_path / "logs").exists() + + state = _install_hangup_protection(gateway_mode=False) + try: + assert (tmp_path / "logs").is_dir() + assert (tmp_path / "logs" / "update.log").exists() + finally: + _finalize_update_output(state) + + def test_non_fatal_if_log_setup_fails(self, monkeypatch): + """If get_hermes_home() raises, stdio must be left untouched but SIGHUP still handled.""" + prev_out, prev_err = sys.stdout, sys.stderr + + def _boom(): + raise RuntimeError("no home for you") + + # Patch the import inside _install_hangup_protection. + monkeypatch.setattr( + "hermes_cli.config.get_hermes_home", _boom, raising=True + ) + + original_handler = ( + signal.getsignal(signal.SIGHUP) if hasattr(signal, "SIGHUP") else None + ) + + state = _install_hangup_protection(gateway_mode=False) + + try: + assert sys.stdout is prev_out + assert sys.stderr is prev_err + assert state["installed"] is False + # SIGHUP must still be installed even when log setup fails. + if hasattr(signal, "SIGHUP"): + assert signal.getsignal(signal.SIGHUP) == signal.SIG_IGN + finally: + _finalize_update_output(state) + if hasattr(signal, "SIGHUP") and original_handler is not None: + signal.signal(signal.SIGHUP, original_handler) + + +# ----------------------------------------------------------------------------- +# _finalize_update_output +# ----------------------------------------------------------------------------- + + +class TestFinalizeUpdateOutput: + def test_none_state_is_noop(self): + _finalize_update_output(None) # must not raise + + def test_restores_streams_and_closes_log(self, tmp_path, monkeypatch): + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + import hermes_cli.config as _cfg + if hasattr(_cfg, "_HERMES_HOME_CACHE"): + _cfg._HERMES_HOME_CACHE = None # type: ignore[attr-defined] + + prev_out = sys.stdout + state = _install_hangup_protection(gateway_mode=False) + log_file = state["log_file"] + + assert sys.stdout is not prev_out + assert log_file is not None + + _finalize_update_output(state) + + assert sys.stdout is prev_out + # The log file handle should be closed. + assert log_file.closed is True + + def test_skipped_install_leaves_stdio_alone(self): + """When install failed (state['installed']=False) finalize should not + touch sys.stdout / sys.stderr (they were never wrapped).""" + # Build a synthetic state that mimics a failed install. + sentinel_out = object() + state = { + "prev_stdout": sentinel_out, + "prev_stderr": sentinel_out, + "log_file": None, + "installed": False, + } + before_out, before_err = sys.stdout, sys.stderr + + _finalize_update_output(state) + + assert sys.stdout is before_out + assert sys.stderr is before_err diff --git a/website/docs/getting-started/updating.md b/website/docs/getting-started/updating.md index b0e34e07d..eb74427a0 100644 --- a/website/docs/getting-started/updating.md +++ b/website/docs/getting-started/updating.md @@ -59,6 +59,21 @@ Already up to date. (or: Updating abc1234..def5678) If `git status --short` shows unexpected changes after `hermes update`, stop and inspect them before continuing. This usually means local modifications were reapplied on top of the updated code, or a dependency step refreshed lockfiles. ::: +### If your terminal disconnects mid-update + +`hermes update` protects itself against accidental terminal loss: + +- The update ignores `SIGHUP`, so closing your SSH session or terminal window no longer kills it mid-install. `pip` and `git` child processes inherit this protection, so the Python environment cannot be left half-installed by a dropped connection. +- All output is mirrored to `~/.hermes/logs/update.log` while the update runs. If your terminal disappears, reconnect and inspect the log to see whether the update finished and whether the gateway restart succeeded: + +```bash +tail -f ~/.hermes/logs/update.log +``` + +- `Ctrl-C` (SIGINT) and system shutdown (SIGTERM) are still honored — those are deliberate cancellations, not accidents. + +You no longer need to wrap `hermes update` in `screen` or `tmux` to survive a terminal drop. + ### Checking your current version ```bash