diff --git a/gateway/status.py b/gateway/status.py index 8998c7a7a64..9290e0917cd 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -1133,6 +1133,22 @@ def _consume_pid_marker_for_self( pass return False + # Cross-profile guard (#29092): reject markers written by a gateway + # running under a different HERMES_HOME. When two profile gateway + # services share the same default ~/.hermes (HERMES_HOME not set + # distinctly), the marker path resolves to the same file for both. A + # --replace from profile B could land in profile A's marker, match on + # PID + start_time by coincidence of a shared PID namespace, and make + # profile A exit 0 — only to be revived by systemd Restart=always, + # which then races the replacer again, flapping indefinitely. The + # field is absent in markers written by older Hermes versions; treat + # absent as "same home" so old markers and single-profile setups are + # unaffected. Leave a mismatched marker in place so the correct + # profile can still consume it. + replacer_home = record.get("replacer_hermes_home") + if replacer_home is not None and replacer_home != str(get_hermes_home()): + return False + our_pid = os.getpid() our_start_time = _get_process_start_time(our_pid) # Start-time is a PID-reuse guard. It is only meaningful when both @@ -1179,6 +1195,7 @@ def write_takeover_marker(target_pid: int) -> bool: "target_pid": target_pid, "target_start_time": target_start_time, "replacer_pid": os.getpid(), + "replacer_hermes_home": str(get_hermes_home()), "written_at": _utc_now_iso(), } _write_json_file(_get_takeover_marker_path(), record) diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index 42165b74803..3f88ff1c07c 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -1128,6 +1128,64 @@ class TestTakeoverMarker: # We are not the target — must NOT consume as planned assert result is False + def test_write_marker_records_replacer_hermes_home(self, tmp_path, monkeypatch): + """The marker stamps the replacer's HERMES_HOME for cross-profile guard (#29092).""" + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 42) + + status.write_takeover_marker(target_pid=12345) + + payload = json.loads((tmp_path / ".gateway-takeover.json").read_text()) + assert payload["replacer_hermes_home"] == str(tmp_path) + + def test_consume_rejects_marker_from_different_profile(self, tmp_path, monkeypatch): + """Regression (#29092): a marker written by a gateway under a DIFFERENT + HERMES_HOME must be rejected even when PID + start_time coincidentally + match — otherwise two profile services sharing a default ~/.hermes flap + each other in an infinite SIGTERM/Restart loop. The mismatched marker is + left in place so the profile it was actually meant for can consume it. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100) + marker_path = tmp_path / ".gateway-takeover.json" + from datetime import datetime, timezone + # Marker names OUR pid + start_time (the coincidental match the bug + # relied on) but was written by a gateway in a different profile. + marker_path.write_text(json.dumps({ + "target_pid": os.getpid(), + "target_start_time": 100, + "replacer_pid": 99999, + "replacer_hermes_home": str(tmp_path / "profiles" / "other"), + "written_at": datetime.now(timezone.utc).isoformat(), + })) + + result = status.consume_takeover_marker_for_self() + + assert result is False + # Left in place for the correct profile, not griefed away. + assert marker_path.exists() + + def test_consume_accepts_legacy_marker_without_hermes_home(self, tmp_path, monkeypatch): + """Back-compat (#29092): markers written by older Hermes versions have no + ``replacer_hermes_home`` field; an absent field is treated as same-home so + single-profile setups and mixed old/new deployments keep working. + """ + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100) + marker_path = tmp_path / ".gateway-takeover.json" + from datetime import datetime, timezone + marker_path.write_text(json.dumps({ + "target_pid": os.getpid(), + "target_start_time": 100, + "replacer_pid": 99999, + "written_at": datetime.now(timezone.utc).isoformat(), + })) + + result = status.consume_takeover_marker_for_self() + + assert result is True + assert not marker_path.exists() + class TestPlannedStopMarker: """Tests for intentional service/manual gateway stop markers."""