This commit is contained in:
Brian D. Evans 2026-04-24 21:26:22 +00:00 committed by GitHub
commit 3472ba4e2d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 392 additions and 14 deletions

View file

@ -6,6 +6,7 @@ Handles: hermes gateway [run|start|stop|restart|status|install|uninstall|setup]
import asyncio
import os
import re
import shutil
import signal
import subprocess
@ -59,7 +60,69 @@ class GatewayRuntimeSnapshot:
def has_process_service_mismatch(self) -> bool:
return self.service_installed and self.running and not self.service_running
def _get_service_pids() -> set:
# ``launchctl list <label>`` on macOS returns a plist-dict line that looks
# like ``\t"PID" = 855;``. The only integer that must be on that line is
# the PID, so a narrow regex is safer than string-splitting on quotes and
# semicolons (which breaks when keys get new spacing or quoting in future
# macOS releases). Anchored to ``"PID"`` so sibling keys like
# ``LastExitStatus`` can't match.
_LAUNCHD_PLIST_PID_RE = re.compile(r'"PID"\s*=\s*(\d+)\s*;')
def _parse_launchd_list_output(stdout: str, label: str) -> set[int]:
"""Extract PIDs for ``label`` from ``launchctl list`` output.
macOS ``launchctl list`` has two distinct output formats and which one
you get depends on whether a label is passed:
* **No label argument** tab-separated table ``PID\\tStatus\\tLabel``,
one row per service, header line first.
* **With label argument** plist-dict dump of the single service,
e.g. ``"PID" = 855;\\n"Label" = "ai.hermes.gateway";`` (#15225).
The original implementation only handled the first format, so when
called with a label the data line parsed as a dict key/value and
``parts[2]`` (``'"ai.hermes.gateway";'``) never equalled the bare
label string no PID ever got extracted. This helper accepts both
formats and returns the extracted PIDs as a set. Filtering on
``label`` (to avoid grabbing an unrelated service's PID from a
no-arg dump) is done only when the tab-separated path is taken,
because the plist-dict dump is already scoped to the label the
caller requested.
"""
pids: set[int] = set()
if not stdout:
return pids
# Plist-dict format: ``"PID" = NNN;`` lines. ``findall`` returns all
# matches as strings — we trust ``int()`` since the regex capture is
# ``\d+``.
for match in _LAUNCHD_PLIST_PID_RE.findall(stdout):
try:
pid = int(match)
except ValueError:
continue
if pid > 0:
pids.add(pid)
if pids:
return pids
# Tab-separated fallback (no-label dump). Header line starts with
# ``PID\tStatus\tLabel`` so only rows whose third column matches our
# label contribute.
for line in stdout.strip().splitlines():
parts = line.split()
if len(parts) >= 3 and parts[2] == label:
try:
pid = int(parts[0])
except ValueError:
continue
if pid > 0:
pids.add(pid)
return pids
def _get_service_pids() -> set[int]:
"""Return PIDs currently managed by systemd or launchd gateway services.
Used to avoid killing freshly-restarted service processes when sweeping
@ -67,7 +130,7 @@ def _get_service_pids() -> set:
service manager having committed the new PID before the restart command
returns (true for both systemd and launchd in practice).
"""
pids: set = set()
pids: set[int] = set()
# --- systemd (Linux): user and system scopes ---
if supports_systemd_services():
@ -106,16 +169,7 @@ def _get_service_pids() -> set:
capture_output=True, text=True, timeout=5,
)
if result.returncode == 0:
# Output: "PID\tStatus\tLabel" header, then one data line
for line in result.stdout.strip().splitlines():
parts = line.split()
if len(parts) >= 3 and parts[2] == label:
try:
pid = int(parts[0])
if pid > 0:
pids.add(pid)
except ValueError:
pass
pids |= _parse_launchd_list_output(result.stdout, label)
except (FileNotFoundError, subprocess.TimeoutExpired):
pass
@ -299,8 +353,16 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
pass
current_cmd = ""
else:
# ``-A -ww`` (listing every process, unlimited line width) is
# portable across Linux/procps, BSD/Darwin, and busybox. The
# old invocation passed ``eww`` as a positional argument — on
# Darwin that's rejected as "illegal argument" (#15225) and on
# FreeBSD the embedded ``e`` prepended environment variables
# to the command column, which broke ``split(None, 1)`` there
# (#9069). Dropping the ``e`` also stops leaking env vars
# (API keys, tokens) into the command string we parse.
result = subprocess.run(
["ps", "-A", "eww", "-o", "pid=,command="],
["ps", "-A", "-ww", "-o", "pid=,command="],
capture_output=True,
text=True,
timeout=10,

View file

@ -261,7 +261,9 @@ def test_find_gateway_pids_falls_back_to_pid_file_when_process_scan_fails(monkey
monkeypatch.setattr("gateway.status.get_running_pid", lambda: 321)
def fake_run(cmd, **kwargs):
if cmd[:4] == ["ps", "-A", "eww", "-o"]:
# ``-A -ww`` is the portable form since #15225 — earlier revisions
# passed ``eww`` which Darwin rejects as illegal.
if cmd[:4] == ["ps", "-A", "-ww", "-o"]:
return SimpleNamespace(returncode=1, stdout="", stderr="ps failed")
raise AssertionError(f"Unexpected command: {cmd}")

View file

@ -0,0 +1,314 @@
"""Regression guard for #15225 — macOS gateway-pid detection.
Two independent bugs in ``hermes_cli/gateway.py`` caused
``find_gateway_pids()`` to return an empty list on macOS, which made
``hermes cron list`` print a false-positive "Gateway is not running"
warning even when launchd had the service loaded and cron firing was
working normally.
Bug 1 ``_get_service_pids()`` called ``launchctl list <label>`` and
then parsed the output as the tab-separated table ``PID\\tStatus\\tLabel``.
With a label argument, macOS ``launchctl`` instead returns a plist-dict
``{ "PID" = 855; "Label" = "ai.hermes.gateway"; ...}`` the old loop's
``parts[2] == label`` check never matched the quoted/semicolon'd token
so no PID was ever extracted.
Bug 2 ``_scan_gateway_pids()`` invoked ``ps -A eww -o pid=,command=``.
Darwin's ``ps`` rejects ``eww`` as an illegal positional argument
(``exit=1``, empty stdout), so the fallback PID scan also produced
nothing.
These tests pin the fixes at the production-code entry points:
* ``_parse_launchd_list_output`` (the new helper) is exercised directly
with the exact plist-dict output from the bug report AND with the
tab-separated format the old code expected both must surface the
correct PID.
* ``_get_service_pids`` is exercised end-to-end with ``subprocess.run``
patched to return the plist-dict payload, proving the macOS branch
no longer comes back empty for a running service.
* ``_scan_gateway_pids`` (and ``find_gateway_pids`` through it) is
exercised with a patched ``subprocess.run`` that records the exact
command list the implementation passed failing loudly if the
invocation ever regresses to ``eww``.
"""
from types import SimpleNamespace
import pytest
from hermes_cli import gateway
# Real-world plist-dict payload taken from the #15225 reproduction,
# minus shell-specific paths (so the fixture is portable across contributors).
_MACOS_LAUNCHCTL_LIST_LABEL_OUTPUT = """\
{
\t"StandardOutPath" = "/Users/someuser/.hermes/logs/gateway.log";
\t"LimitLoadToSessionType" = "Aqua";
\t"StandardErrorPath" = "/Users/someuser/.hermes/logs/gateway.error.log";
\t"Label" = "ai.hermes.gateway";
\t"OnDemand" = true;
\t"LastExitStatus" = 0;
\t"PID" = 855;
\t"Program" = "/Users/someuser/.hermes/hermes-agent/venv/bin/python";
\t"ProgramArguments" = (
\t\t"/Users/someuser/.hermes/hermes-agent/venv/bin/python";
\t\t"-m";
\t\t"hermes_cli";
\t\t"gateway";
\t\t"run";
\t);
};
"""
# Tab-separated format — what ``launchctl list`` (no label) produces. The
# old code path assumed this format regardless of whether a label was
# passed; the helper still has to handle it so callers that change their
# mind about whether to pass a label don't break.
_MACOS_LAUNCHCTL_LIST_TABLE_OUTPUT = """\
PID\tStatus\tLabel
855\t0\tai.hermes.gateway
-\t0\tcom.apple.somethingelse
1024\t0\tcom.other.service
"""
# ---------------------------------------------------------------------------
# Bug 1 — _parse_launchd_list_output (new helper)
# ---------------------------------------------------------------------------
class TestParseLaunchdListOutput:
def test_extracts_pid_from_plist_dict_format(self):
"""macOS returns a plist-dict when a label is passed to
``launchctl list``. This is the exact format from the #15225
reproduction. The old regex-less string.split()-based code saw
``parts[2] == '"ai.hermes.gateway";'`` (quoted + semicolon'd)
and never matched the bare label so it returned the empty
set. The new helper parses ``"PID" = N;`` directly."""
pids = gateway._parse_launchd_list_output(
_MACOS_LAUNCHCTL_LIST_LABEL_OUTPUT, "ai.hermes.gateway",
)
assert pids == {855}
def test_extracts_pid_from_tab_separated_format(self):
"""When the caller runs ``launchctl list`` without a label, the
output is a tab-separated table. The helper must still extract
only PIDs for the requested label (the table may list many
services)."""
pids = gateway._parse_launchd_list_output(
_MACOS_LAUNCHCTL_LIST_TABLE_OUTPUT, "ai.hermes.gateway",
)
assert pids == {855}
def test_skips_tab_separated_rows_with_dash_pid(self):
"""In the tab-separated format, an unloaded service appears as
``-\\t0\\tlabel``. ``int("-")`` would raise; the helper must
skip such rows cleanly and return an empty set if the requested
label is only present as a dash row."""
pids = gateway._parse_launchd_list_output(
"PID\tStatus\tLabel\n-\t0\tai.hermes.gateway\n",
"ai.hermes.gateway",
)
assert pids == set()
def test_empty_output_returns_empty_set(self):
"""A 0-length stdout (e.g. launchctl hanging, or a non-zero exit
where the caller already filtered) must never crash the helper
or synthesise fake PIDs."""
assert gateway._parse_launchd_list_output("", "ai.hermes.gateway") == set()
def test_plist_dict_wins_over_tab_separated_when_both_look_plausible(self):
"""Defensive: if some future macOS release returned *both*
shapes concatenated, trust the plist-dict (the label-scoped
answer) rather than the table (which might list sibling
services for the same user). The helper picks the plist
matches first and only falls back to the tab-separated path
when none were found."""
merged = (
_MACOS_LAUNCHCTL_LIST_LABEL_OUTPUT
+ "\nPID\tStatus\tLabel\n9999\t0\tai.hermes.gateway\n"
)
pids = gateway._parse_launchd_list_output(merged, "ai.hermes.gateway")
assert pids == {855} # from the plist-dict block, not the table
def test_pid_zero_is_rejected(self):
"""``"PID" = 0;`` would be either kernel or "service not running"
depending on the OS either way, never a real gateway PID.
Accepting it would crash downstream ``os.kill`` calls that
interpret 0 as 'this process group'."""
pids = gateway._parse_launchd_list_output(
'{\n\t"PID" = 0;\n\t"Label" = "ai.hermes.gateway";\n};\n',
"ai.hermes.gateway",
)
assert pids == set()
def test_other_label_entry_in_table_ignored(self):
"""The tab-separated path must filter on the third column
(label) a different service at the top of the table must not
contribute its PID to our result."""
text = (
"PID\tStatus\tLabel\n"
"1234\t0\tcom.apple.unrelated\n"
"855\t0\tai.hermes.gateway\n"
)
pids = gateway._parse_launchd_list_output(text, "ai.hermes.gateway")
assert pids == {855}
@pytest.mark.parametrize("spacing", [
'"PID" = 855;', # canonical
'"PID"=855;', # no spaces
'"PID" = 855 ;', # extra spaces
'\t\t"PID" = 855;', # leading tabs (real output)
])
def test_pid_regex_tolerates_whitespace_variants(self, spacing):
"""launchd's plist dumper is not a stable format — prior Apple
releases have shifted spacing. The regex must not hinge on an
exact ``"PID" = N;`` with a single space or we'll regress
silently on a future macOS update."""
text = f'{{\n\t{spacing}\n\t"Label" = "ai.hermes.gateway";\n}};\n'
pids = gateway._parse_launchd_list_output(text, "ai.hermes.gateway")
assert pids == {855}
# ---------------------------------------------------------------------------
# Bug 1 — _get_service_pids wiring (macOS branch)
# ---------------------------------------------------------------------------
class TestGetServicePidsMacOS:
"""End-to-end check that the macOS branch of ``_get_service_pids``
actually picks up the PID for a running gateway now the symptom
#15225 complained about."""
def _patch_macos(self, monkeypatch, launchctl_stdout, launchctl_rc=0):
monkeypatch.setattr(gateway, "is_macos", lambda: True)
# Avoid running systemd probes on a macOS-emulation test —
# ``supports_systemd_services`` already returns False on Darwin
# but tests don't run on Darwin by default.
monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False)
monkeypatch.setattr(gateway, "get_launchd_label", lambda: "ai.hermes.gateway")
def fake_run(cmd, **kwargs):
if cmd[:3] == ["launchctl", "list", "ai.hermes.gateway"]:
return SimpleNamespace(
returncode=launchctl_rc,
stdout=launchctl_stdout,
stderr="",
)
raise AssertionError(f"Unexpected subprocess call: {cmd!r}")
monkeypatch.setattr(gateway.subprocess, "run", fake_run)
def test_returns_pid_for_running_service(self, monkeypatch):
self._patch_macos(monkeypatch, _MACOS_LAUNCHCTL_LIST_LABEL_OUTPUT)
assert gateway._get_service_pids() == {855}
def test_returns_empty_set_when_launchctl_exits_nonzero(self, monkeypatch):
"""The caller should not record PIDs from a failed command —
e.g. launchctl returning "Could not find service" as rc=3."""
self._patch_macos(monkeypatch, "", launchctl_rc=3)
assert gateway._get_service_pids() == set()
def test_survives_launchctl_missing_on_path(self, monkeypatch):
"""If ``launchctl`` isn't installed (hermes CLI running on a
nonstandard macOS image), we must not propagate the
``FileNotFoundError`` return an empty set and let the ps
fallback try next."""
monkeypatch.setattr(gateway, "is_macos", lambda: True)
monkeypatch.setattr(gateway, "supports_systemd_services", lambda: False)
monkeypatch.setattr(gateway, "get_launchd_label", lambda: "ai.hermes.gateway")
def boom(cmd, **kwargs):
raise FileNotFoundError("launchctl")
monkeypatch.setattr(gateway.subprocess, "run", boom)
assert gateway._get_service_pids() == set()
# ---------------------------------------------------------------------------
# Bug 2 — _scan_gateway_pids ps invocation
# ---------------------------------------------------------------------------
class TestPsInvocationPortability:
"""The ``ps`` fallback invocation must be portable across Linux,
Darwin, and FreeBSD. ``eww`` was illegal on Darwin (#15225) and
leaked env vars (including API keys) on FreeBSD (#9069) — the
portable replacement is ``-A -ww`` (list all processes, wide
wide) with no ``e``.
"""
def _invoke(self, monkeypatch, *, returncode=0, stdout=""):
recorded = {"cmd": None}
def fake_run(cmd, **kwargs):
recorded["cmd"] = list(cmd)
return SimpleNamespace(returncode=returncode, stdout=stdout, stderr="")
monkeypatch.setattr(gateway, "is_windows", lambda: False)
monkeypatch.setattr(gateway.subprocess, "run", fake_run)
# Exercise the production path. ``_scan_gateway_pids`` takes an
# exclude-set; we pass empty so nothing is filtered out.
pids = gateway._scan_gateway_pids(set(), all_profiles=True)
return recorded["cmd"], pids
def test_never_passes_eww_positional(self, monkeypatch):
"""The central regression guard: neither ``"eww"`` nor any
``e``-containing flag stack that would attach env vars should
end up on the command line. Parametrise in case a future
refactor adds an equivalent flag."""
cmd, _ = self._invoke(monkeypatch)
assert cmd is not None, "production code never invoked ps"
assert "eww" not in cmd, (
f"ps was invoked with 'eww' (illegal on Darwin, leaks env on "
f"FreeBSD). Full cmd: {cmd!r}"
)
# ``-A -ww`` is the expected replacement — each flag should appear.
assert "-A" in cmd
assert "-ww" in cmd
def test_invocation_shape_matches_expected(self, monkeypatch):
"""Pin the exact invocation shape so a refactor doesn't
accidentally alter argv ordering. ``pid=,command=`` is important
the trailing ``=`` on each field suppresses the header line
that would otherwise confuse the parser."""
cmd, _ = self._invoke(monkeypatch)
assert cmd == ["ps", "-A", "-ww", "-o", "pid=,command="], (
f"ps invocation drifted: {cmd!r}"
)
def test_parses_macos_style_output(self, monkeypatch):
"""Verify the parse loop still works for realistic Darwin ps
output ``pid=,command=`` suppresses the header and lines are
space-separated PID + command text. Command string must contain
one of the gateway-scan patterns (e.g. ``hermes_cli.main gateway``)
to be picked up same as the production matcher."""
sample = (
" 855 /Users/someuser/.hermes/hermes-agent/venv/bin/python -m hermes_cli.main gateway run\n"
" 1024 /usr/sbin/cupsd -l\n"
)
monkeypatch.setattr(gateway, "is_windows", lambda: False)
def fake_run(cmd, **kwargs):
return SimpleNamespace(returncode=0, stdout=sample, stderr="")
monkeypatch.setattr(gateway.subprocess, "run", fake_run)
pids = gateway._scan_gateway_pids(set(), all_profiles=True)
assert 855 in pids, (
f"expected PID 855 extracted from ps output, got {pids}"
)
assert 1024 not in pids, (
"non-gateway process (cupsd) must not be matched"
)
def test_returncode_nonzero_returns_empty_list(self, monkeypatch):
"""Darwin used to return exit=1 for the old ``eww`` invocation.
Even with the fix, a future platform that errors on ``-ww``
must not crash the caller returns []."""
cmd, pids = self._invoke(monkeypatch, returncode=1, stdout="")
assert pids == []
# Still invoked with the expected shape, just failed.
assert cmd == ["ps", "-A", "-ww", "-o", "pid=,command="]