fix(logging): recover gateway.log handler from external rotation (#34349)

External rotation (logrotate, manual `mv gateway.log gateway.log.1`,
another process rotating the file) leaves `_ManagedRotatingFileHandler`'s
open fd pinned to the renamed inode. All subsequent writes go to the
rotated backup instead of the file every operator expects to read,
producing the symptom 'gateway.log frozen mid-write while agent.log
keeps growing with gateway.* records'.

PR #16229 fixed the original CLI->gateway init-order bug (#8404) so the
handler attaches in the first place. This is the sibling fix for what
happens after attach, when something external rotates underneath us.

Adds a WatchedFileHandler-style inode check on emit(): if baseFilename
no longer matches the open stream's (dev,ino), close the stale fd and
reopen at the expected path. doRollover() refreshes the snapshot so our
own rollover isn't misidentified as external.

Five regression tests cover the matrix: external rename, external
unlink, external truncate (must NOT trigger reopen — inode unchanged),
normal doRollover() (must still work), and the end-to-end
Allen-reproduction (rotate + re-call setup_logging).

55/55 tests in tests/test_hermes_logging.py pass; 5972/5972 in
tests/gateway/ pass.
This commit is contained in:
Teknium 2026-05-28 22:26:00 -07:00 committed by GitHub
parent a30480bd2b
commit 75d2c081c9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 264 additions and 6 deletions

View file

@ -296,19 +296,39 @@ def setup_verbose_logging() -> None:
# ---------------------------------------------------------------------------
class _ManagedRotatingFileHandler(RotatingFileHandler):
"""RotatingFileHandler that ensures group-writable perms in managed mode.
"""RotatingFileHandler that ensures group-writable perms in managed mode
AND survives external rotation.
In managed mode (NixOS), the stateDir uses setgid (2770) so new files
inherit the hermes group. However, both _open() (initial creation) and
doRollover() create files via open(), which uses the process umask
typically 0022, producing 0644. This subclass applies chmod 0660 after
both operations so the gateway and interactive users can share log files.
Two responsibilities:
1. In managed mode (NixOS), the stateDir uses setgid (2770) so new files
inherit the hermes group. However, both ``_open()`` (initial creation)
and ``doRollover()`` create files via ``open()``, which uses the
process umask typically 0022, producing 0644. This subclass applies
``chmod 0660`` after both operations so the gateway and interactive
users can share log files.
2. ``RotatingFileHandler`` keeps an open file descriptor. If anything
rotates the file *externally* (``logrotate``, manual ``mv``,
another process rotating under us, a transient unlink), our fd
keeps pointing at the renamed/unlinked inode and every subsequent
write goes to ``gateway.log.1`` instead of ``gateway.log`` silent
log loss for the file every operator expects to read. Before each
emit we ``stat`` ``baseFilename`` and compare it against the open
stream's inode; on mismatch we reopen. This is the same pattern
as stdlib ``WatchedFileHandler.reopenIfNeeded()``, adapted for
rotating handlers.
"""
def __init__(self, *args, **kwargs):
from hermes_cli.config import is_managed
self._managed = is_managed()
super().__init__(*args, **kwargs)
# Snapshot the inode of the currently open stream so emit() can
# detect external rotation without an extra fstat per write.
self._stat_dev: Optional[int] = None
self._stat_ino: Optional[int] = None
self._record_stream_stat()
def _chmod_if_managed(self):
if self._managed:
@ -317,6 +337,70 @@ class _ManagedRotatingFileHandler(RotatingFileHandler):
except OSError:
pass
def _record_stream_stat(self) -> None:
"""Snapshot dev/ino of ``baseFilename`` so we can detect external rotation."""
try:
st = os.stat(self.baseFilename)
self._stat_dev, self._stat_ino = st.st_dev, st.st_ino
except OSError:
self._stat_dev, self._stat_ino = None, None
def _reopen_if_externally_rotated(self) -> None:
"""Reopen the stream when ``baseFilename`` no longer matches our fd.
Triggered when ``baseFilename`` was renamed (logrotate), unlinked,
or replaced by a different inode. Silent + best-effort: any error
falls back to the existing (possibly stale) stream so logging keeps
working instead of dying on a stat failure.
"""
try:
st = os.stat(self.baseFilename)
except FileNotFoundError:
# File was rotated/unlinked underneath us. Close + reopen so a
# fresh inode is created at the expected path.
try:
if self.stream is not None:
self.stream.close()
except Exception:
pass
self.stream = None # type: ignore[assignment]
try:
self.stream = self._open()
self._record_stream_stat()
except Exception:
# Couldn't reopen — leave stream=None; next emit will
# bail rather than write to a stale inode.
pass
return
except OSError:
return # transient — try again on the next emit
if self._stat_dev is None or self._stat_ino is None:
self._stat_dev, self._stat_ino = st.st_dev, st.st_ino
return
if (st.st_dev, st.st_ino) != (self._stat_dev, self._stat_ino):
# baseFilename now points at a DIFFERENT inode than the one we
# hold open. Close the old stream and open the new file.
try:
if self.stream is not None:
self.stream.close()
except Exception:
pass
self.stream = None # type: ignore[assignment]
try:
self.stream = self._open()
self._stat_dev, self._stat_ino = st.st_dev, st.st_ino
except Exception:
pass
def emit(self, record: logging.LogRecord) -> None:
# Cheap-ish stat-per-record check; the kernel caches inode metadata
# so the syscall is sub-microsecond on a hot file.
if self.stream is not None or os.path.exists(self.baseFilename):
self._reopen_if_externally_rotated()
super().emit(record)
def _open(self):
stream = super()._open()
self._chmod_if_managed()
@ -325,6 +409,9 @@ class _ManagedRotatingFileHandler(RotatingFileHandler):
def doRollover(self):
super().doRollover()
self._chmod_if_managed()
# Our own rollover writes a new baseFilename; refresh the snapshot
# so the next emit doesn't mistake it for external rotation.
self._record_stream_stat()
def _add_rotating_handler(

View file

@ -774,3 +774,174 @@ class TestReadLoggingConfig:
level, max_size, backup = hermes_logging._read_logging_config()
assert level is None
class TestExternalRotationRecovery:
"""_ManagedRotatingFileHandler recovers from external rotation.
External rotation = anything that renames, unlinks, or replaces the
log file without going through ``doRollover()``: logrotate, manual
``mv``, another process rotating under us, or a transient ``rm``.
Before this fix the open file descriptor stayed pinned to the old
inode forever, so every subsequent write went to the rotated backup
instead of the file the operator expects to read.
"""
def _make_handler(self, log_path: Path) -> hermes_logging._ManagedRotatingFileHandler:
handler = hermes_logging._ManagedRotatingFileHandler(
str(log_path), maxBytes=10 * 1024 * 1024, backupCount=3,
encoding="utf-8",
)
handler.setLevel(logging.INFO)
handler.setFormatter(logging.Formatter("%(message)s"))
return handler
def _emit(self, handler: logging.Handler, msg: str) -> None:
record = logging.LogRecord(
name="gateway.run", level=logging.INFO, pathname="", lineno=0,
msg=msg, args=(), exc_info=None,
)
# Match the record factory that hermes_logging installs at import time.
record.session_tag = ""
handler.emit(record)
handler.flush()
def test_recovers_after_external_rename(self, tmp_path):
"""logrotate-style external rename: ``mv gateway.log gateway.log.1``.
Handler's fd was pinned to the renamed inode; new writes used to
go to ``gateway.log.1`` forever. After fix, the handler reopens
``gateway.log`` at the original path.
"""
log_path = tmp_path / "gateway.log"
rotated = tmp_path / "gateway.log.1"
handler = self._make_handler(log_path)
try:
self._emit(handler, "before rotation")
assert log_path.read_text() == "before rotation\n"
# External rotation (NOT via handler.doRollover()).
os.rename(log_path, rotated)
assert not log_path.exists()
self._emit(handler, "after rotation")
# The new write should land in a freshly recreated gateway.log,
# not appended to the rotated backup.
assert log_path.exists(), "handler did not recreate gateway.log"
assert log_path.read_text() == "after rotation\n"
assert rotated.read_text() == "before rotation\n"
finally:
handler.close()
def test_recovers_after_external_unlink(self, tmp_path):
"""``rm gateway.log`` then keep writing — handler recreates the file."""
log_path = tmp_path / "gateway.log"
handler = self._make_handler(log_path)
try:
self._emit(handler, "before unlink")
assert log_path.read_text() == "before unlink\n"
os.unlink(log_path)
assert not log_path.exists()
self._emit(handler, "after unlink")
assert log_path.exists()
assert log_path.read_text() == "after unlink\n"
finally:
handler.close()
def test_external_truncate_does_not_force_reopen(self, tmp_path):
"""``: > gateway.log`` keeps the same inode — no reopen needed.
Truncation in place preserves the inode, so subsequent writes
continue to the same file descriptor. We assert the post-truncate
content reflects the truncate (size shrinks) and then grows with
new writes i.e. the handler correctly does NOT detect this as
an inode change.
"""
log_path = tmp_path / "gateway.log"
handler = self._make_handler(log_path)
try:
self._emit(handler, "AAAA" * 32)
assert log_path.stat().st_size > 0
with open(log_path, "w"):
pass # truncate to zero
assert log_path.stat().st_size == 0
self._emit(handler, "after truncate")
assert log_path.read_text() == "after truncate\n"
finally:
handler.close()
def test_normal_rollover_still_works(self, tmp_path):
"""Handler-driven ``doRollover()`` must continue to work normally.
Regression guard: the inode-snapshot bookkeeping must be refreshed
in ``doRollover()`` so the very next emit doesn't mistake our own
rollover for an external one and double-reopen.
"""
log_path = tmp_path / "gateway.log"
rotated = tmp_path / "gateway.log.1"
# Tiny maxBytes forces rollover after the first record.
handler = hermes_logging._ManagedRotatingFileHandler(
str(log_path), maxBytes=1, backupCount=1, encoding="utf-8",
)
handler.setLevel(logging.INFO)
handler.setFormatter(logging.Formatter("%(message)s"))
try:
self._emit(handler, "first record")
self._emit(handler, "second record")
self._emit(handler, "third record")
# After rollover we should have BOTH files, with the most
# recent record in the live file.
assert log_path.exists()
assert rotated.exists()
assert "third record" in log_path.read_text()
finally:
handler.close()
def test_gateway_log_attached_after_external_rotation_then_re_setup(
self, hermes_home,
):
"""End-to-end Allen-reproduction: gateway.log gets externally rotated,
``setup_logging(mode='gateway')`` is re-called, the handler keeps
working.
Reproduces Allen's symptom (gateway.log frozen mid-write, all gateway
records leaking to agent.log) when something external rotates the
file between setup_logging() calls.
"""
hermes_logging.setup_logging(hermes_home=hermes_home, mode="gateway")
gw_path = hermes_home / "logs" / "gateway.log"
rotated = hermes_home / "logs" / "gateway.log.1"
logging.getLogger("gateway.run").info("line BEFORE rotation")
for h in logging.getLogger().handlers:
try: h.flush()
except Exception: pass
assert "BEFORE rotation" in gw_path.read_text()
# External actor renames the file out from under us.
os.rename(gw_path, rotated)
assert not gw_path.exists()
# Caller (or some restart path) re-enters setup_logging. This used
# to silently no-op due to the per-path dedup check, leaving the
# stale fd in place.
hermes_logging.setup_logging(hermes_home=hermes_home, mode="gateway")
logging.getLogger("gateway.run").info("line AFTER rotation")
for h in logging.getLogger().handlers:
try: h.flush()
except Exception: pass
# The new record must reach the live gateway.log, not the rotated
# backup. Allen's logs had everything past the rotation point
# going into agent.log only, never gateway.log.
assert gw_path.exists(), "gateway.log was never recreated"
assert "AFTER rotation" in gw_path.read_text()
assert "AFTER rotation" not in rotated.read_text()