mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-07-01 12:02:05 +00:00
fix(gateway): namespace --replace takeover marker by HERMES_HOME to stop cross-profile flap (#29092)
Two profile gateway services sharing the default ~/.hermes resolve the takeover marker to the same path. A --replace from profile B could land in profile A's marker, match on PID + start_time by coincidence of a shared PID namespace, and make profile A exit 0 — only to be revived by systemd Restart=always, which races the replacer again, flapping indefinitely. write_takeover_marker now stamps replacer_hermes_home; the shared consume path rejects markers written under a different HERMES_HOME and leaves them in place for the correct profile. Absent field (older markers) is treated as same-home, so single-profile and mixed old/new deployments are unaffected. Salvaged from #31414 by @CryptoByz onto current main (branch was ~3962 commits behind; the consume function had since been refactored for issue #34597). Co-authored-by: CryptoByz.
This commit is contained in:
parent
28ed883959
commit
2b73dd1ca6
2 changed files with 75 additions and 0 deletions
|
|
@ -1133,6 +1133,22 @@ def _consume_pid_marker_for_self(
|
|||
pass
|
||||
return False
|
||||
|
||||
# Cross-profile guard (#29092): reject markers written by a gateway
|
||||
# running under a different HERMES_HOME. When two profile gateway
|
||||
# services share the same default ~/.hermes (HERMES_HOME not set
|
||||
# distinctly), the marker path resolves to the same file for both. A
|
||||
# --replace from profile B could land in profile A's marker, match on
|
||||
# PID + start_time by coincidence of a shared PID namespace, and make
|
||||
# profile A exit 0 — only to be revived by systemd Restart=always,
|
||||
# which then races the replacer again, flapping indefinitely. The
|
||||
# field is absent in markers written by older Hermes versions; treat
|
||||
# absent as "same home" so old markers and single-profile setups are
|
||||
# unaffected. Leave a mismatched marker in place so the correct
|
||||
# profile can still consume it.
|
||||
replacer_home = record.get("replacer_hermes_home")
|
||||
if replacer_home is not None and replacer_home != str(get_hermes_home()):
|
||||
return False
|
||||
|
||||
our_pid = os.getpid()
|
||||
our_start_time = _get_process_start_time(our_pid)
|
||||
# Start-time is a PID-reuse guard. It is only meaningful when both
|
||||
|
|
@ -1179,6 +1195,7 @@ def write_takeover_marker(target_pid: int) -> bool:
|
|||
"target_pid": target_pid,
|
||||
"target_start_time": target_start_time,
|
||||
"replacer_pid": os.getpid(),
|
||||
"replacer_hermes_home": str(get_hermes_home()),
|
||||
"written_at": _utc_now_iso(),
|
||||
}
|
||||
_write_json_file(_get_takeover_marker_path(), record)
|
||||
|
|
|
|||
|
|
@ -1128,6 +1128,64 @@ class TestTakeoverMarker:
|
|||
# We are not the target — must NOT consume as planned
|
||||
assert result is False
|
||||
|
||||
def test_write_marker_records_replacer_hermes_home(self, tmp_path, monkeypatch):
|
||||
"""The marker stamps the replacer's HERMES_HOME for cross-profile guard (#29092)."""
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 42)
|
||||
|
||||
status.write_takeover_marker(target_pid=12345)
|
||||
|
||||
payload = json.loads((tmp_path / ".gateway-takeover.json").read_text())
|
||||
assert payload["replacer_hermes_home"] == str(tmp_path)
|
||||
|
||||
def test_consume_rejects_marker_from_different_profile(self, tmp_path, monkeypatch):
|
||||
"""Regression (#29092): a marker written by a gateway under a DIFFERENT
|
||||
HERMES_HOME must be rejected even when PID + start_time coincidentally
|
||||
match — otherwise two profile services sharing a default ~/.hermes flap
|
||||
each other in an infinite SIGTERM/Restart loop. The mismatched marker is
|
||||
left in place so the profile it was actually meant for can consume it.
|
||||
"""
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
|
||||
marker_path = tmp_path / ".gateway-takeover.json"
|
||||
from datetime import datetime, timezone
|
||||
# Marker names OUR pid + start_time (the coincidental match the bug
|
||||
# relied on) but was written by a gateway in a different profile.
|
||||
marker_path.write_text(json.dumps({
|
||||
"target_pid": os.getpid(),
|
||||
"target_start_time": 100,
|
||||
"replacer_pid": 99999,
|
||||
"replacer_hermes_home": str(tmp_path / "profiles" / "other"),
|
||||
"written_at": datetime.now(timezone.utc).isoformat(),
|
||||
}))
|
||||
|
||||
result = status.consume_takeover_marker_for_self()
|
||||
|
||||
assert result is False
|
||||
# Left in place for the correct profile, not griefed away.
|
||||
assert marker_path.exists()
|
||||
|
||||
def test_consume_accepts_legacy_marker_without_hermes_home(self, tmp_path, monkeypatch):
|
||||
"""Back-compat (#29092): markers written by older Hermes versions have no
|
||||
``replacer_hermes_home`` field; an absent field is treated as same-home so
|
||||
single-profile setups and mixed old/new deployments keep working.
|
||||
"""
|
||||
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
|
||||
marker_path = tmp_path / ".gateway-takeover.json"
|
||||
from datetime import datetime, timezone
|
||||
marker_path.write_text(json.dumps({
|
||||
"target_pid": os.getpid(),
|
||||
"target_start_time": 100,
|
||||
"replacer_pid": 99999,
|
||||
"written_at": datetime.now(timezone.utc).isoformat(),
|
||||
}))
|
||||
|
||||
result = status.consume_takeover_marker_for_self()
|
||||
|
||||
assert result is True
|
||||
assert not marker_path.exists()
|
||||
|
||||
|
||||
class TestPlannedStopMarker:
|
||||
"""Tests for intentional service/manual gateway stop markers."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue