diff --git a/hermes_logging.py b/hermes_logging.py index 2de105b2d9e..b7d9c9e5513 100644 --- a/hermes_logging.py +++ b/hermes_logging.py @@ -296,19 +296,39 @@ def setup_verbose_logging() -> None: # --------------------------------------------------------------------------- class _ManagedRotatingFileHandler(RotatingFileHandler): - """RotatingFileHandler that ensures group-writable perms in managed mode. + """RotatingFileHandler that ensures group-writable perms in managed mode + AND survives external rotation. - In managed mode (NixOS), the stateDir uses setgid (2770) so new files - inherit the hermes group. However, both _open() (initial creation) and - doRollover() create files via open(), which uses the process umask — - typically 0022, producing 0644. This subclass applies chmod 0660 after - both operations so the gateway and interactive users can share log files. + Two responsibilities: + + 1. In managed mode (NixOS), the stateDir uses setgid (2770) so new files + inherit the hermes group. However, both ``_open()`` (initial creation) + and ``doRollover()`` create files via ``open()``, which uses the + process umask — typically 0022, producing 0644. This subclass applies + ``chmod 0660`` after both operations so the gateway and interactive + users can share log files. + + 2. ``RotatingFileHandler`` keeps an open file descriptor. If anything + rotates the file *externally* (``logrotate``, manual ``mv``, + another process rotating under us, a transient unlink), our fd + keeps pointing at the renamed/unlinked inode and every subsequent + write goes to ``gateway.log.1`` instead of ``gateway.log`` — silent + log loss for the file every operator expects to read. Before each + emit we ``stat`` ``baseFilename`` and compare it against the open + stream's inode; on mismatch we reopen. This is the same pattern + as stdlib ``WatchedFileHandler.reopenIfNeeded()``, adapted for + rotating handlers. """ def __init__(self, *args, **kwargs): from hermes_cli.config import is_managed self._managed = is_managed() super().__init__(*args, **kwargs) + # Snapshot the inode of the currently open stream so emit() can + # detect external rotation without an extra fstat per write. + self._stat_dev: Optional[int] = None + self._stat_ino: Optional[int] = None + self._record_stream_stat() def _chmod_if_managed(self): if self._managed: @@ -317,6 +337,70 @@ class _ManagedRotatingFileHandler(RotatingFileHandler): except OSError: pass + def _record_stream_stat(self) -> None: + """Snapshot dev/ino of ``baseFilename`` so we can detect external rotation.""" + try: + st = os.stat(self.baseFilename) + self._stat_dev, self._stat_ino = st.st_dev, st.st_ino + except OSError: + self._stat_dev, self._stat_ino = None, None + + def _reopen_if_externally_rotated(self) -> None: + """Reopen the stream when ``baseFilename`` no longer matches our fd. + + Triggered when ``baseFilename`` was renamed (logrotate), unlinked, + or replaced by a different inode. Silent + best-effort: any error + falls back to the existing (possibly stale) stream so logging keeps + working instead of dying on a stat failure. + """ + try: + st = os.stat(self.baseFilename) + except FileNotFoundError: + # File was rotated/unlinked underneath us. Close + reopen so a + # fresh inode is created at the expected path. + try: + if self.stream is not None: + self.stream.close() + except Exception: + pass + self.stream = None # type: ignore[assignment] + try: + self.stream = self._open() + self._record_stream_stat() + except Exception: + # Couldn't reopen — leave stream=None; next emit will + # bail rather than write to a stale inode. + pass + return + except OSError: + return # transient — try again on the next emit + + if self._stat_dev is None or self._stat_ino is None: + self._stat_dev, self._stat_ino = st.st_dev, st.st_ino + return + + if (st.st_dev, st.st_ino) != (self._stat_dev, self._stat_ino): + # baseFilename now points at a DIFFERENT inode than the one we + # hold open. Close the old stream and open the new file. + try: + if self.stream is not None: + self.stream.close() + except Exception: + pass + self.stream = None # type: ignore[assignment] + try: + self.stream = self._open() + self._stat_dev, self._stat_ino = st.st_dev, st.st_ino + except Exception: + pass + + def emit(self, record: logging.LogRecord) -> None: + # Cheap-ish stat-per-record check; the kernel caches inode metadata + # so the syscall is sub-microsecond on a hot file. + if self.stream is not None or os.path.exists(self.baseFilename): + self._reopen_if_externally_rotated() + super().emit(record) + def _open(self): stream = super()._open() self._chmod_if_managed() @@ -325,6 +409,9 @@ class _ManagedRotatingFileHandler(RotatingFileHandler): def doRollover(self): super().doRollover() self._chmod_if_managed() + # Our own rollover writes a new baseFilename; refresh the snapshot + # so the next emit doesn't mistake it for external rotation. + self._record_stream_stat() def _add_rotating_handler( diff --git a/tests/test_hermes_logging.py b/tests/test_hermes_logging.py index 8eed1c9a1bf..999db56c2a4 100644 --- a/tests/test_hermes_logging.py +++ b/tests/test_hermes_logging.py @@ -774,3 +774,174 @@ class TestReadLoggingConfig: level, max_size, backup = hermes_logging._read_logging_config() assert level is None + + +class TestExternalRotationRecovery: + """_ManagedRotatingFileHandler recovers from external rotation. + + External rotation = anything that renames, unlinks, or replaces the + log file without going through ``doRollover()``: logrotate, manual + ``mv``, another process rotating under us, or a transient ``rm``. + Before this fix the open file descriptor stayed pinned to the old + inode forever, so every subsequent write went to the rotated backup + instead of the file the operator expects to read. + """ + + def _make_handler(self, log_path: Path) -> hermes_logging._ManagedRotatingFileHandler: + handler = hermes_logging._ManagedRotatingFileHandler( + str(log_path), maxBytes=10 * 1024 * 1024, backupCount=3, + encoding="utf-8", + ) + handler.setLevel(logging.INFO) + handler.setFormatter(logging.Formatter("%(message)s")) + return handler + + def _emit(self, handler: logging.Handler, msg: str) -> None: + record = logging.LogRecord( + name="gateway.run", level=logging.INFO, pathname="", lineno=0, + msg=msg, args=(), exc_info=None, + ) + # Match the record factory that hermes_logging installs at import time. + record.session_tag = "" + handler.emit(record) + handler.flush() + + def test_recovers_after_external_rename(self, tmp_path): + """logrotate-style external rename: ``mv gateway.log gateway.log.1``. + + Handler's fd was pinned to the renamed inode; new writes used to + go to ``gateway.log.1`` forever. After fix, the handler reopens + ``gateway.log`` at the original path. + """ + log_path = tmp_path / "gateway.log" + rotated = tmp_path / "gateway.log.1" + handler = self._make_handler(log_path) + try: + self._emit(handler, "before rotation") + assert log_path.read_text() == "before rotation\n" + + # External rotation (NOT via handler.doRollover()). + os.rename(log_path, rotated) + assert not log_path.exists() + + self._emit(handler, "after rotation") + + # The new write should land in a freshly recreated gateway.log, + # not appended to the rotated backup. + assert log_path.exists(), "handler did not recreate gateway.log" + assert log_path.read_text() == "after rotation\n" + assert rotated.read_text() == "before rotation\n" + finally: + handler.close() + + def test_recovers_after_external_unlink(self, tmp_path): + """``rm gateway.log`` then keep writing — handler recreates the file.""" + log_path = tmp_path / "gateway.log" + handler = self._make_handler(log_path) + try: + self._emit(handler, "before unlink") + assert log_path.read_text() == "before unlink\n" + + os.unlink(log_path) + assert not log_path.exists() + + self._emit(handler, "after unlink") + assert log_path.exists() + assert log_path.read_text() == "after unlink\n" + finally: + handler.close() + + def test_external_truncate_does_not_force_reopen(self, tmp_path): + """``: > gateway.log`` keeps the same inode — no reopen needed. + + Truncation in place preserves the inode, so subsequent writes + continue to the same file descriptor. We assert the post-truncate + content reflects the truncate (size shrinks) and then grows with + new writes — i.e. the handler correctly does NOT detect this as + an inode change. + """ + log_path = tmp_path / "gateway.log" + handler = self._make_handler(log_path) + try: + self._emit(handler, "AAAA" * 32) + assert log_path.stat().st_size > 0 + + with open(log_path, "w"): + pass # truncate to zero + assert log_path.stat().st_size == 0 + + self._emit(handler, "after truncate") + assert log_path.read_text() == "after truncate\n" + finally: + handler.close() + + def test_normal_rollover_still_works(self, tmp_path): + """Handler-driven ``doRollover()`` must continue to work normally. + + Regression guard: the inode-snapshot bookkeeping must be refreshed + in ``doRollover()`` so the very next emit doesn't mistake our own + rollover for an external one and double-reopen. + """ + log_path = tmp_path / "gateway.log" + rotated = tmp_path / "gateway.log.1" + + # Tiny maxBytes forces rollover after the first record. + handler = hermes_logging._ManagedRotatingFileHandler( + str(log_path), maxBytes=1, backupCount=1, encoding="utf-8", + ) + handler.setLevel(logging.INFO) + handler.setFormatter(logging.Formatter("%(message)s")) + try: + self._emit(handler, "first record") + self._emit(handler, "second record") + self._emit(handler, "third record") + + # After rollover we should have BOTH files, with the most + # recent record in the live file. + assert log_path.exists() + assert rotated.exists() + assert "third record" in log_path.read_text() + finally: + handler.close() + + def test_gateway_log_attached_after_external_rotation_then_re_setup( + self, hermes_home, + ): + """End-to-end Allen-reproduction: gateway.log gets externally rotated, + ``setup_logging(mode='gateway')`` is re-called, the handler keeps + working. + + Reproduces Allen's symptom (gateway.log frozen mid-write, all gateway + records leaking to agent.log) when something external rotates the + file between setup_logging() calls. + """ + hermes_logging.setup_logging(hermes_home=hermes_home, mode="gateway") + gw_path = hermes_home / "logs" / "gateway.log" + rotated = hermes_home / "logs" / "gateway.log.1" + + logging.getLogger("gateway.run").info("line BEFORE rotation") + for h in logging.getLogger().handlers: + try: h.flush() + except Exception: pass + assert "BEFORE rotation" in gw_path.read_text() + + # External actor renames the file out from under us. + os.rename(gw_path, rotated) + assert not gw_path.exists() + + # Caller (or some restart path) re-enters setup_logging. This used + # to silently no-op due to the per-path dedup check, leaving the + # stale fd in place. + hermes_logging.setup_logging(hermes_home=hermes_home, mode="gateway") + + logging.getLogger("gateway.run").info("line AFTER rotation") + for h in logging.getLogger().handlers: + try: h.flush() + except Exception: pass + + # The new record must reach the live gateway.log, not the rotated + # backup. Allen's logs had everything past the rotation point + # going into agent.log only, never gateway.log. + assert gw_path.exists(), "gateway.log was never recreated" + assert "AFTER rotation" in gw_path.read_text() + assert "AFTER rotation" not in rotated.read_text()