fix(gateway): namespace --replace takeover marker by HERMES_HOME to stop cross-profile flap (#29092)

Two profile gateway services sharing the default ~/.hermes resolve the
takeover marker to the same path. A --replace from profile B could land
in profile A's marker, match on PID + start_time by coincidence of a
shared PID namespace, and make profile A exit 0 — only to be revived by
systemd Restart=always, which races the replacer again, flapping
indefinitely.

write_takeover_marker now stamps replacer_hermes_home; the shared
consume path rejects markers written under a different HERMES_HOME and
leaves them in place for the correct profile. Absent field (older
markers) is treated as same-home, so single-profile and mixed old/new
deployments are unaffected.

Salvaged from #31414 by @CryptoByz onto current main (branch was ~3962
commits behind; the consume function had since been refactored for
issue #34597). Co-authored-by: CryptoByz.
This commit is contained in:
teknium1 2026-06-27 19:30:07 -07:00 committed by Teknium
parent 28ed883959
commit 2b73dd1ca6
2 changed files with 75 additions and 0 deletions

View file

@ -1133,6 +1133,22 @@ def _consume_pid_marker_for_self(
pass
return False
# Cross-profile guard (#29092): reject markers written by a gateway
# running under a different HERMES_HOME. When two profile gateway
# services share the same default ~/.hermes (HERMES_HOME not set
# distinctly), the marker path resolves to the same file for both. A
# --replace from profile B could land in profile A's marker, match on
# PID + start_time by coincidence of a shared PID namespace, and make
# profile A exit 0 — only to be revived by systemd Restart=always,
# which then races the replacer again, flapping indefinitely. The
# field is absent in markers written by older Hermes versions; treat
# absent as "same home" so old markers and single-profile setups are
# unaffected. Leave a mismatched marker in place so the correct
# profile can still consume it.
replacer_home = record.get("replacer_hermes_home")
if replacer_home is not None and replacer_home != str(get_hermes_home()):
return False
our_pid = os.getpid()
our_start_time = _get_process_start_time(our_pid)
# Start-time is a PID-reuse guard. It is only meaningful when both
@ -1179,6 +1195,7 @@ def write_takeover_marker(target_pid: int) -> bool:
"target_pid": target_pid,
"target_start_time": target_start_time,
"replacer_pid": os.getpid(),
"replacer_hermes_home": str(get_hermes_home()),
"written_at": _utc_now_iso(),
}
_write_json_file(_get_takeover_marker_path(), record)

View file

@ -1128,6 +1128,64 @@ class TestTakeoverMarker:
# We are not the target — must NOT consume as planned
assert result is False
def test_write_marker_records_replacer_hermes_home(self, tmp_path, monkeypatch):
"""The marker stamps the replacer's HERMES_HOME for cross-profile guard (#29092)."""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 42)
status.write_takeover_marker(target_pid=12345)
payload = json.loads((tmp_path / ".gateway-takeover.json").read_text())
assert payload["replacer_hermes_home"] == str(tmp_path)
def test_consume_rejects_marker_from_different_profile(self, tmp_path, monkeypatch):
"""Regression (#29092): a marker written by a gateway under a DIFFERENT
HERMES_HOME must be rejected even when PID + start_time coincidentally
match otherwise two profile services sharing a default ~/.hermes flap
each other in an infinite SIGTERM/Restart loop. The mismatched marker is
left in place so the profile it was actually meant for can consume it.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
marker_path = tmp_path / ".gateway-takeover.json"
from datetime import datetime, timezone
# Marker names OUR pid + start_time (the coincidental match the bug
# relied on) but was written by a gateway in a different profile.
marker_path.write_text(json.dumps({
"target_pid": os.getpid(),
"target_start_time": 100,
"replacer_pid": 99999,
"replacer_hermes_home": str(tmp_path / "profiles" / "other"),
"written_at": datetime.now(timezone.utc).isoformat(),
}))
result = status.consume_takeover_marker_for_self()
assert result is False
# Left in place for the correct profile, not griefed away.
assert marker_path.exists()
def test_consume_accepts_legacy_marker_without_hermes_home(self, tmp_path, monkeypatch):
"""Back-compat (#29092): markers written by older Hermes versions have no
``replacer_hermes_home`` field; an absent field is treated as same-home so
single-profile setups and mixed old/new deployments keep working.
"""
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
marker_path = tmp_path / ".gateway-takeover.json"
from datetime import datetime, timezone
marker_path.write_text(json.dumps({
"target_pid": os.getpid(),
"target_start_time": 100,
"replacer_pid": 99999,
"written_at": datetime.now(timezone.utc).isoformat(),
}))
result = status.consume_takeover_marker_for_self()
assert result is True
assert not marker_path.exists()
class TestPlannedStopMarker:
"""Tests for intentional service/manual gateway stop markers."""