mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-29 06:31:32 +00:00
fix(logging): recover gateway.log handler from external rotation (#34349)
External rotation (logrotate, manual `mv gateway.log gateway.log.1`, another process rotating the file) leaves `_ManagedRotatingFileHandler`'s open fd pinned to the renamed inode. All subsequent writes go to the rotated backup instead of the file every operator expects to read, producing the symptom 'gateway.log frozen mid-write while agent.log keeps growing with gateway.* records'. PR #16229 fixed the original CLI->gateway init-order bug (#8404) so the handler attaches in the first place. This is the sibling fix for what happens after attach, when something external rotates underneath us. Adds a WatchedFileHandler-style inode check on emit(): if baseFilename no longer matches the open stream's (dev,ino), close the stale fd and reopen at the expected path. doRollover() refreshes the snapshot so our own rollover isn't misidentified as external. Five regression tests cover the matrix: external rename, external unlink, external truncate (must NOT trigger reopen — inode unchanged), normal doRollover() (must still work), and the end-to-end Allen-reproduction (rotate + re-call setup_logging). 55/55 tests in tests/test_hermes_logging.py pass; 5972/5972 in tests/gateway/ pass.
This commit is contained in:
parent
a30480bd2b
commit
75d2c081c9
2 changed files with 264 additions and 6 deletions
|
|
@ -296,19 +296,39 @@ def setup_verbose_logging() -> None:
|
|||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _ManagedRotatingFileHandler(RotatingFileHandler):
|
||||
"""RotatingFileHandler that ensures group-writable perms in managed mode.
|
||||
"""RotatingFileHandler that ensures group-writable perms in managed mode
|
||||
AND survives external rotation.
|
||||
|
||||
In managed mode (NixOS), the stateDir uses setgid (2770) so new files
|
||||
inherit the hermes group. However, both _open() (initial creation) and
|
||||
doRollover() create files via open(), which uses the process umask —
|
||||
typically 0022, producing 0644. This subclass applies chmod 0660 after
|
||||
both operations so the gateway and interactive users can share log files.
|
||||
Two responsibilities:
|
||||
|
||||
1. In managed mode (NixOS), the stateDir uses setgid (2770) so new files
|
||||
inherit the hermes group. However, both ``_open()`` (initial creation)
|
||||
and ``doRollover()`` create files via ``open()``, which uses the
|
||||
process umask — typically 0022, producing 0644. This subclass applies
|
||||
``chmod 0660`` after both operations so the gateway and interactive
|
||||
users can share log files.
|
||||
|
||||
2. ``RotatingFileHandler`` keeps an open file descriptor. If anything
|
||||
rotates the file *externally* (``logrotate``, manual ``mv``,
|
||||
another process rotating under us, a transient unlink), our fd
|
||||
keeps pointing at the renamed/unlinked inode and every subsequent
|
||||
write goes to ``gateway.log.1`` instead of ``gateway.log`` — silent
|
||||
log loss for the file every operator expects to read. Before each
|
||||
emit we ``stat`` ``baseFilename`` and compare it against the open
|
||||
stream's inode; on mismatch we reopen. This is the same pattern
|
||||
as stdlib ``WatchedFileHandler.reopenIfNeeded()``, adapted for
|
||||
rotating handlers.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
from hermes_cli.config import is_managed
|
||||
self._managed = is_managed()
|
||||
super().__init__(*args, **kwargs)
|
||||
# Snapshot the inode of the currently open stream so emit() can
|
||||
# detect external rotation without an extra fstat per write.
|
||||
self._stat_dev: Optional[int] = None
|
||||
self._stat_ino: Optional[int] = None
|
||||
self._record_stream_stat()
|
||||
|
||||
def _chmod_if_managed(self):
|
||||
if self._managed:
|
||||
|
|
@ -317,6 +337,70 @@ class _ManagedRotatingFileHandler(RotatingFileHandler):
|
|||
except OSError:
|
||||
pass
|
||||
|
||||
def _record_stream_stat(self) -> None:
|
||||
"""Snapshot dev/ino of ``baseFilename`` so we can detect external rotation."""
|
||||
try:
|
||||
st = os.stat(self.baseFilename)
|
||||
self._stat_dev, self._stat_ino = st.st_dev, st.st_ino
|
||||
except OSError:
|
||||
self._stat_dev, self._stat_ino = None, None
|
||||
|
||||
def _reopen_if_externally_rotated(self) -> None:
|
||||
"""Reopen the stream when ``baseFilename`` no longer matches our fd.
|
||||
|
||||
Triggered when ``baseFilename`` was renamed (logrotate), unlinked,
|
||||
or replaced by a different inode. Silent + best-effort: any error
|
||||
falls back to the existing (possibly stale) stream so logging keeps
|
||||
working instead of dying on a stat failure.
|
||||
"""
|
||||
try:
|
||||
st = os.stat(self.baseFilename)
|
||||
except FileNotFoundError:
|
||||
# File was rotated/unlinked underneath us. Close + reopen so a
|
||||
# fresh inode is created at the expected path.
|
||||
try:
|
||||
if self.stream is not None:
|
||||
self.stream.close()
|
||||
except Exception:
|
||||
pass
|
||||
self.stream = None # type: ignore[assignment]
|
||||
try:
|
||||
self.stream = self._open()
|
||||
self._record_stream_stat()
|
||||
except Exception:
|
||||
# Couldn't reopen — leave stream=None; next emit will
|
||||
# bail rather than write to a stale inode.
|
||||
pass
|
||||
return
|
||||
except OSError:
|
||||
return # transient — try again on the next emit
|
||||
|
||||
if self._stat_dev is None or self._stat_ino is None:
|
||||
self._stat_dev, self._stat_ino = st.st_dev, st.st_ino
|
||||
return
|
||||
|
||||
if (st.st_dev, st.st_ino) != (self._stat_dev, self._stat_ino):
|
||||
# baseFilename now points at a DIFFERENT inode than the one we
|
||||
# hold open. Close the old stream and open the new file.
|
||||
try:
|
||||
if self.stream is not None:
|
||||
self.stream.close()
|
||||
except Exception:
|
||||
pass
|
||||
self.stream = None # type: ignore[assignment]
|
||||
try:
|
||||
self.stream = self._open()
|
||||
self._stat_dev, self._stat_ino = st.st_dev, st.st_ino
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def emit(self, record: logging.LogRecord) -> None:
|
||||
# Cheap-ish stat-per-record check; the kernel caches inode metadata
|
||||
# so the syscall is sub-microsecond on a hot file.
|
||||
if self.stream is not None or os.path.exists(self.baseFilename):
|
||||
self._reopen_if_externally_rotated()
|
||||
super().emit(record)
|
||||
|
||||
def _open(self):
|
||||
stream = super()._open()
|
||||
self._chmod_if_managed()
|
||||
|
|
@ -325,6 +409,9 @@ class _ManagedRotatingFileHandler(RotatingFileHandler):
|
|||
def doRollover(self):
|
||||
super().doRollover()
|
||||
self._chmod_if_managed()
|
||||
# Our own rollover writes a new baseFilename; refresh the snapshot
|
||||
# so the next emit doesn't mistake it for external rotation.
|
||||
self._record_stream_stat()
|
||||
|
||||
|
||||
def _add_rotating_handler(
|
||||
|
|
|
|||
|
|
@ -774,3 +774,174 @@ class TestReadLoggingConfig:
|
|||
|
||||
level, max_size, backup = hermes_logging._read_logging_config()
|
||||
assert level is None
|
||||
|
||||
|
||||
class TestExternalRotationRecovery:
|
||||
"""_ManagedRotatingFileHandler recovers from external rotation.
|
||||
|
||||
External rotation = anything that renames, unlinks, or replaces the
|
||||
log file without going through ``doRollover()``: logrotate, manual
|
||||
``mv``, another process rotating under us, or a transient ``rm``.
|
||||
Before this fix the open file descriptor stayed pinned to the old
|
||||
inode forever, so every subsequent write went to the rotated backup
|
||||
instead of the file the operator expects to read.
|
||||
"""
|
||||
|
||||
def _make_handler(self, log_path: Path) -> hermes_logging._ManagedRotatingFileHandler:
|
||||
handler = hermes_logging._ManagedRotatingFileHandler(
|
||||
str(log_path), maxBytes=10 * 1024 * 1024, backupCount=3,
|
||||
encoding="utf-8",
|
||||
)
|
||||
handler.setLevel(logging.INFO)
|
||||
handler.setFormatter(logging.Formatter("%(message)s"))
|
||||
return handler
|
||||
|
||||
def _emit(self, handler: logging.Handler, msg: str) -> None:
|
||||
record = logging.LogRecord(
|
||||
name="gateway.run", level=logging.INFO, pathname="", lineno=0,
|
||||
msg=msg, args=(), exc_info=None,
|
||||
)
|
||||
# Match the record factory that hermes_logging installs at import time.
|
||||
record.session_tag = ""
|
||||
handler.emit(record)
|
||||
handler.flush()
|
||||
|
||||
def test_recovers_after_external_rename(self, tmp_path):
|
||||
"""logrotate-style external rename: ``mv gateway.log gateway.log.1``.
|
||||
|
||||
Handler's fd was pinned to the renamed inode; new writes used to
|
||||
go to ``gateway.log.1`` forever. After fix, the handler reopens
|
||||
``gateway.log`` at the original path.
|
||||
"""
|
||||
log_path = tmp_path / "gateway.log"
|
||||
rotated = tmp_path / "gateway.log.1"
|
||||
handler = self._make_handler(log_path)
|
||||
try:
|
||||
self._emit(handler, "before rotation")
|
||||
assert log_path.read_text() == "before rotation\n"
|
||||
|
||||
# External rotation (NOT via handler.doRollover()).
|
||||
os.rename(log_path, rotated)
|
||||
assert not log_path.exists()
|
||||
|
||||
self._emit(handler, "after rotation")
|
||||
|
||||
# The new write should land in a freshly recreated gateway.log,
|
||||
# not appended to the rotated backup.
|
||||
assert log_path.exists(), "handler did not recreate gateway.log"
|
||||
assert log_path.read_text() == "after rotation\n"
|
||||
assert rotated.read_text() == "before rotation\n"
|
||||
finally:
|
||||
handler.close()
|
||||
|
||||
def test_recovers_after_external_unlink(self, tmp_path):
|
||||
"""``rm gateway.log`` then keep writing — handler recreates the file."""
|
||||
log_path = tmp_path / "gateway.log"
|
||||
handler = self._make_handler(log_path)
|
||||
try:
|
||||
self._emit(handler, "before unlink")
|
||||
assert log_path.read_text() == "before unlink\n"
|
||||
|
||||
os.unlink(log_path)
|
||||
assert not log_path.exists()
|
||||
|
||||
self._emit(handler, "after unlink")
|
||||
assert log_path.exists()
|
||||
assert log_path.read_text() == "after unlink\n"
|
||||
finally:
|
||||
handler.close()
|
||||
|
||||
def test_external_truncate_does_not_force_reopen(self, tmp_path):
|
||||
"""``: > gateway.log`` keeps the same inode — no reopen needed.
|
||||
|
||||
Truncation in place preserves the inode, so subsequent writes
|
||||
continue to the same file descriptor. We assert the post-truncate
|
||||
content reflects the truncate (size shrinks) and then grows with
|
||||
new writes — i.e. the handler correctly does NOT detect this as
|
||||
an inode change.
|
||||
"""
|
||||
log_path = tmp_path / "gateway.log"
|
||||
handler = self._make_handler(log_path)
|
||||
try:
|
||||
self._emit(handler, "AAAA" * 32)
|
||||
assert log_path.stat().st_size > 0
|
||||
|
||||
with open(log_path, "w"):
|
||||
pass # truncate to zero
|
||||
assert log_path.stat().st_size == 0
|
||||
|
||||
self._emit(handler, "after truncate")
|
||||
assert log_path.read_text() == "after truncate\n"
|
||||
finally:
|
||||
handler.close()
|
||||
|
||||
def test_normal_rollover_still_works(self, tmp_path):
|
||||
"""Handler-driven ``doRollover()`` must continue to work normally.
|
||||
|
||||
Regression guard: the inode-snapshot bookkeeping must be refreshed
|
||||
in ``doRollover()`` so the very next emit doesn't mistake our own
|
||||
rollover for an external one and double-reopen.
|
||||
"""
|
||||
log_path = tmp_path / "gateway.log"
|
||||
rotated = tmp_path / "gateway.log.1"
|
||||
|
||||
# Tiny maxBytes forces rollover after the first record.
|
||||
handler = hermes_logging._ManagedRotatingFileHandler(
|
||||
str(log_path), maxBytes=1, backupCount=1, encoding="utf-8",
|
||||
)
|
||||
handler.setLevel(logging.INFO)
|
||||
handler.setFormatter(logging.Formatter("%(message)s"))
|
||||
try:
|
||||
self._emit(handler, "first record")
|
||||
self._emit(handler, "second record")
|
||||
self._emit(handler, "third record")
|
||||
|
||||
# After rollover we should have BOTH files, with the most
|
||||
# recent record in the live file.
|
||||
assert log_path.exists()
|
||||
assert rotated.exists()
|
||||
assert "third record" in log_path.read_text()
|
||||
finally:
|
||||
handler.close()
|
||||
|
||||
def test_gateway_log_attached_after_external_rotation_then_re_setup(
|
||||
self, hermes_home,
|
||||
):
|
||||
"""End-to-end Allen-reproduction: gateway.log gets externally rotated,
|
||||
``setup_logging(mode='gateway')`` is re-called, the handler keeps
|
||||
working.
|
||||
|
||||
Reproduces Allen's symptom (gateway.log frozen mid-write, all gateway
|
||||
records leaking to agent.log) when something external rotates the
|
||||
file between setup_logging() calls.
|
||||
"""
|
||||
hermes_logging.setup_logging(hermes_home=hermes_home, mode="gateway")
|
||||
gw_path = hermes_home / "logs" / "gateway.log"
|
||||
rotated = hermes_home / "logs" / "gateway.log.1"
|
||||
|
||||
logging.getLogger("gateway.run").info("line BEFORE rotation")
|
||||
for h in logging.getLogger().handlers:
|
||||
try: h.flush()
|
||||
except Exception: pass
|
||||
assert "BEFORE rotation" in gw_path.read_text()
|
||||
|
||||
# External actor renames the file out from under us.
|
||||
os.rename(gw_path, rotated)
|
||||
assert not gw_path.exists()
|
||||
|
||||
# Caller (or some restart path) re-enters setup_logging. This used
|
||||
# to silently no-op due to the per-path dedup check, leaving the
|
||||
# stale fd in place.
|
||||
hermes_logging.setup_logging(hermes_home=hermes_home, mode="gateway")
|
||||
|
||||
logging.getLogger("gateway.run").info("line AFTER rotation")
|
||||
for h in logging.getLogger().handlers:
|
||||
try: h.flush()
|
||||
except Exception: pass
|
||||
|
||||
# The new record must reach the live gateway.log, not the rotated
|
||||
# backup. Allen's logs had everything past the rotation point
|
||||
# going into agent.log only, never gateway.log.
|
||||
assert gw_path.exists(), "gateway.log was never recreated"
|
||||
assert "AFTER rotation" in gw_path.read_text()
|
||||
assert "AFTER rotation" not in rotated.read_text()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue