diff --git a/gateway/status.py b/gateway/status.py index 0f812c23e34..a27d8c7c02e 100644 --- a/gateway/status.py +++ b/gateway/status.py @@ -190,8 +190,8 @@ def _read_process_cmdline(pid: int) -> Optional[str]: return None -def looks_like_gateway_command_line(command: str | None) -> bool: - """Return True only for a real ``gateway run`` process command line. +def _gateway_command_subcommand(command: str | None) -> str | None: + """Return the Hermes gateway lifecycle subcommand from a command line. Lifecycle decisions (is the gateway up? did restart relaunch it?) must not fire on loose substring matches. The previous ``"... gateway" in cmdline`` @@ -211,7 +211,7 @@ def looks_like_gateway_command_line(command: str | None) -> bool: either side of the ``gateway`` subcommand. """ if not command: - return False + return None try: raw_tokens = shlex.split(command, posix=False) @@ -220,15 +220,15 @@ def looks_like_gateway_command_line(command: str | None) -> bool: # Strip surrounding quotes, normalize slashes + case per token. tokens = [t.strip("\"'").replace("\\", "/").lower() for t in raw_tokens] if not tokens: - return False + return None # Gateway-dedicated entrypoints carry no subcommand to inspect. for token in tokens: if token == "gateway/run.py" or token.endswith("/gateway/run.py"): - return True + return "run" basename = token.rsplit("/", 1)[-1] if basename in ("hermes-gateway", "hermes-gateway.exe"): - return True + return "run" joined = " ".join(tokens) has_gateway_entry = ( @@ -237,7 +237,7 @@ def looks_like_gateway_command_line(command: str | None) -> bool: or any(t.rsplit("/", 1)[-1] in ("hermes", "hermes.exe") for t in tokens) ) if not has_gateway_entry: - return False + return None # Drop profile selectors anywhere: --profile X / -p X / --profile=X / -p=X. # This consumes a profile VALUE of "gateway" too, so the real subcommand @@ -259,9 +259,28 @@ def looks_like_gateway_command_line(command: str | None) -> bool: if token != "gateway": continue if i + 1 >= len(filtered): - return True # bare `hermes gateway` defaults to `run` - return filtered[i + 1] == "run" - return False + return "run" # bare `hermes gateway` defaults to `run` + return filtered[i + 1] + return None + + +def looks_like_gateway_command_line(command: str | None) -> bool: + """Return True only for a real ``gateway run`` process command line.""" + return _gateway_command_subcommand(command) == "run" + + +def looks_like_gateway_runtime_command_line(command: str | None) -> bool: + """Return True for command lines that can host the gateway runtime. + + ``gateway restart`` is normally a management command, not the gateway + runtime. On hosts without a service manager, though, the manual restart + fallback executes ``run_gateway()`` in that same process, so its argv stays + as ``gateway restart`` while it owns the webhook port and writes runtime + state. Keep the public ``looks_like_gateway_command_line()`` strict, and + use this broader matcher only when validating Hermes-owned runtime records + or no-supervisor cleanup scans. + """ + return _gateway_command_subcommand(command) in {"run", "restart"} def _looks_like_gateway_process(pid: int) -> bool: @@ -282,7 +301,7 @@ def _record_looks_like_gateway(record: dict[str, Any]) -> bool: return False cmdline = " ".join(str(part) for part in argv) - return looks_like_gateway_command_line(cmdline) + return looks_like_gateway_runtime_command_line(cmdline) def _build_pid_record() -> dict: @@ -1213,6 +1232,10 @@ def get_running_pid( resolved_lock_path = _get_gateway_lock_path(resolved_pid_path) lock_active = is_gateway_runtime_lock_active(resolved_lock_path) if not lock_active: + if pid_path is None: + runtime_pid = get_runtime_status_running_pid() + if runtime_pid is not None: + return runtime_pid _cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale) return None @@ -1236,6 +1259,10 @@ def get_running_pid( return pid _cleanup_invalid_pid_path(resolved_pid_path, cleanup_stale=cleanup_stale) + if pid_path is None: + runtime_pid = get_runtime_status_running_pid() + if runtime_pid is not None: + return runtime_pid return None diff --git a/hermes_cli/gateway.py b/hermes_cli/gateway.py index 03435eac028..64d7f71f3e4 100644 --- a/hermes_cli/gateway.py +++ b/hermes_cli/gateway.py @@ -308,7 +308,11 @@ def _append_unique_pid( pids.append(pid) -def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> list[int]: +def _scan_gateway_pids( + exclude_pids: set[int], + all_profiles: bool = False, + include_restart_managers: bool = False, +) -> list[int]: """Best-effort process-table scan for gateway PIDs. This supplements the profile-scoped PID file so status views can still spot @@ -325,7 +329,10 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li # scan no longer false-matches ``gateway status``/``dashboard`` siblings or # unrelated processes like ``python -m tui_gateway``. Lazy import mirrors the # circular-import avoidance used elsewhere in this module. - from gateway.status import looks_like_gateway_command_line + from gateway.status import ( + looks_like_gateway_command_line, + looks_like_gateway_runtime_command_line, + ) current_home = str(get_hermes_home().resolve()) current_home_lc = current_home.lower() current_profile_arg = _profile_arg(current_home) @@ -357,6 +364,11 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li return False return True + def _matches_gateway_runtime(command: str) -> bool: + if looks_like_gateway_command_line(command): + return True + return include_restart_managers and looks_like_gateway_runtime_command_line(command) + try: if is_windows(): # Prefer wmic when present (fast, stable output format). On @@ -420,7 +432,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li current_cmd = line[len("CommandLine=") :] elif line.startswith("ProcessId="): pid_str = line[len("ProcessId=") :] - if looks_like_gateway_command_line(current_cmd) and ( + if _matches_gateway_runtime(current_cmd) and ( all_profiles or _matches_current_profile(current_cmd) ): try: @@ -445,7 +457,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li with open(f"/proc/{pid}/cmdline", "rb") as _f: cmdline = _f.read().decode("utf-8", errors="replace") cmdline = cmdline.replace("\x00", " ") - if looks_like_gateway_command_line(cmdline) and ( + if _matches_gateway_runtime(cmdline) and ( all_profiles or _matches_current_profile(cmdline) ): _append_unique_pid(pids, pid, exclude_pids) @@ -488,7 +500,7 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li if pid is None: continue - if looks_like_gateway_command_line(command) and ( + if _matches_gateway_runtime(command) and ( all_profiles or _matches_current_profile(command) ): _append_unique_pid(pids, pid, exclude_pids) @@ -567,7 +579,15 @@ def find_gateway_pids( pass for pid in _get_service_pids(): _append_unique_pid(pids, pid, _exclude) - for pid in _scan_gateway_pids(_exclude, all_profiles=all_profiles): + try: + include_restart_managers = not supports_systemd_services() + except Exception: + include_restart_managers = False + for pid in _scan_gateway_pids( + _exclude, + all_profiles=all_profiles, + include_restart_managers=include_restart_managers, + ): _append_unique_pid(pids, pid, _exclude) return pids diff --git a/tests/gateway/test_gateway_command_line_matcher.py b/tests/gateway/test_gateway_command_line_matcher.py index bc8113b91a0..6482c2f86f2 100644 --- a/tests/gateway/test_gateway_command_line_matcher.py +++ b/tests/gateway/test_gateway_command_line_matcher.py @@ -11,7 +11,10 @@ from __future__ import annotations import pytest -from gateway.status import looks_like_gateway_command_line as matches +from gateway.status import ( + looks_like_gateway_command_line as matches, + looks_like_gateway_runtime_command_line as matches_runtime, +) ACCEPT = [ @@ -58,3 +61,9 @@ def test_accepts_real_gateway_run(cmd): @pytest.mark.parametrize("cmd", REJECT) def test_rejects_non_gateway_run(cmd): assert matches(cmd) is False + + +def test_runtime_matcher_accepts_no_supervisor_restart_process(): + assert matches("python -m hermes_cli.main gateway restart") is False + assert matches_runtime("python -m hermes_cli.main gateway restart") is True + assert matches_runtime("python -m hermes_cli.main gateway status") is False diff --git a/tests/gateway/test_status.py b/tests/gateway/test_status.py index 0a6129b2bb5..a70c028ca15 100644 --- a/tests/gateway/test_status.py +++ b/tests/gateway/test_status.py @@ -174,6 +174,54 @@ class TestGatewayPidState: assert status.get_running_pid() is None assert not pid_path.exists() + def test_get_running_pid_accepts_no_supervisor_restart_runtime(self, tmp_path, monkeypatch): + """WSL/no-systemd restart fallback runs the gateway in a restart argv process.""" + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + pid_path = tmp_path / "gateway.pid" + record = { + "pid": os.getpid(), + "kind": "hermes-gateway", + "argv": ["python", "-m", "hermes_cli.main", "gateway", "restart"], + "start_time": 123, + } + pid_path.write_text(json.dumps(record)) + + monkeypatch.setattr(status.os, "kill", lambda pid, sig: None) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123) + monkeypatch.setattr( + status, + "_read_process_cmdline", + lambda pid: "python -m hermes_cli.main gateway restart", + ) + + assert status.acquire_gateway_runtime_lock() is True + try: + assert status.get_running_pid() == os.getpid() + finally: + status.release_gateway_runtime_lock() + + def test_get_running_pid_falls_back_to_no_supervisor_runtime_state(self, tmp_path, monkeypatch): + """A live gateway_state.json PID should keep status accurate without a pidfile.""" + monkeypatch.setenv("HERMES_HOME", str(tmp_path)) + state_path = tmp_path / "gateway_state.json" + state_path.write_text(json.dumps({ + "gateway_state": "running", + "pid": os.getpid(), + "kind": "hermes-gateway", + "argv": ["python", "-m", "hermes_cli.main", "gateway", "restart"], + "start_time": 123, + })) + + monkeypatch.setattr(status.os, "kill", lambda pid, sig: None) + monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123) + monkeypatch.setattr( + status, + "_read_process_cmdline", + lambda pid: "python -m hermes_cli.main gateway restart", + ) + + assert status.get_running_pid() == os.getpid() + def test_get_running_pid_cleans_stale_metadata_from_dead_foreign_pid(self, tmp_path, monkeypatch): """Stale PID file from a *different* PID (crashed process) must still be cleaned. diff --git a/tests/hermes_cli/test_gateway.py b/tests/hermes_cli/test_gateway.py index 8f9e49c4957..ba29d2f3347 100644 --- a/tests/hermes_cli/test_gateway.py +++ b/tests/hermes_cli/test_gateway.py @@ -821,6 +821,23 @@ def test_find_gateway_pids_falls_back_to_pid_file_when_process_scan_fails(monkey assert gateway.find_gateway_pids() == [321] +def test_find_gateway_pids_includes_restart_managers_without_systemd(monkeypatch): + calls = [] + + monkeypatch.setattr(gateway, "_get_service_pids", lambda: set()) + monkeypatch.setattr("gateway.status.get_running_pid", lambda: None) + monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False) + + def fake_scan(exclude_pids, all_profiles=False, include_restart_managers=False): + calls.append((set(exclude_pids), all_profiles, include_restart_managers)) + return [708] if include_restart_managers else [] + + monkeypatch.setattr(gateway, "_scan_gateway_pids", fake_scan) + + assert gateway.find_gateway_pids(all_profiles=True) == [708] + assert calls == [(set(), True, True)] + + def test_scan_gateway_pids_detects_windows_hermes_exe_case_variants(monkeypatch): monkeypatch.setattr(gateway, "is_windows", lambda: True) monkeypatch.setattr(gateway, "_get_ancestor_pids", lambda: set()) diff --git a/tests/hermes_cli/test_gateway_proc_fallback.py b/tests/hermes_cli/test_gateway_proc_fallback.py index 6b5bb15a97e..e5cad661770 100644 --- a/tests/hermes_cli/test_gateway_proc_fallback.py +++ b/tests/hermes_cli/test_gateway_proc_fallback.py @@ -77,6 +77,43 @@ class TestProcFallback: assert 99999 not in pids mock_ps.assert_not_called() # ps must NOT be called when /proc worked + def test_detects_no_supervisor_restart_process_only_when_enabled(self): + entries = { + 12345: "python -m hermes_cli.main gateway restart", + 99999: _OTHER_CMD, + } + _isdir, _listdir, _open = _fake_proc_dir(entries) + + with ( + patch("hermes_cli.gateway.is_windows", return_value=False), + patch("os.path.isdir", side_effect=_isdir), + patch("os.listdir", side_effect=_listdir), + patch("builtins.open", side_effect=_open), + patch("hermes_cli.gateway._get_ancestor_pids", return_value=set()), + patch("subprocess.run") as mock_ps, + ): + strict_pids = gateway_mod._scan_gateway_pids(set(), all_profiles=True) + + _isdir, _listdir, _open = _fake_proc_dir(entries) + with ( + patch("hermes_cli.gateway.is_windows", return_value=False), + patch("os.path.isdir", side_effect=_isdir), + patch("os.listdir", side_effect=_listdir), + patch("builtins.open", side_effect=_open), + patch("hermes_cli.gateway._get_ancestor_pids", return_value=set()), + patch("subprocess.run") as mock_ps_enabled, + ): + fallback_pids = gateway_mod._scan_gateway_pids( + set(), + all_profiles=True, + include_restart_managers=True, + ) + + assert strict_pids == [] + assert fallback_pids == [12345] + mock_ps.assert_not_called() + mock_ps_enabled.assert_not_called() + def test_excludes_own_pid_from_proc_scan(self): my_pid = os.getpid() entries = {my_pid: _GATEWAY_CMD}