fix(gateway): detect gateway process via /proc in Docker without procps

Salvage of NousResearch/hermes-agent#7622.

Docker images often lack procps so `ps` is unavailable.  Try reading
/proc/*/cmdline first (works in any Linux container) and fall back to
`ps -A eww` only when /proc is not present.  PermissionError on
individual PIDs is silently skipped.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Wesley Simplicio 2026-05-09 13:17:27 -03:00 committed by Teknium
parent 2ffef15675
commit 6bf7ac3185
2 changed files with 196 additions and 32 deletions

View file

@ -394,42 +394,68 @@ def _scan_gateway_pids(exclude_pids: set[int], all_profiles: bool = False) -> li
pass
current_cmd = ""
else:
result = subprocess.run(
["ps", "-A", "eww", "-o", "pid=,command="],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode != 0:
return []
for line in result.stdout.split("\n"):
stripped = line.strip()
if not stripped or "grep" in stripped:
continue
# Try /proc first (works in Docker without procps installed),
# fall back to ps -A eww.
_found_via_proc = False
if os.path.isdir("/proc"):
try:
my_pid = os.getpid()
for entry in os.listdir("/proc"):
if not entry.isdigit():
continue
pid = int(entry)
if pid == my_pid or pid in exclude_pids:
continue
try:
cmdline = open(f"/proc/{pid}/cmdline", "rb").read().decode("utf-8", errors="replace")
cmdline = cmdline.replace("\x00", " ")
if any(p in cmdline for p in patterns) and (
all_profiles or _matches_current_profile(cmdline)
):
_append_unique_pid(pids, pid, exclude_pids)
except (OSError, PermissionError):
continue
_found_via_proc = True
except Exception:
pass
pid = None
command = ""
if not _found_via_proc:
result = subprocess.run(
["ps", "-A", "eww", "-o", "pid=,command="],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode != 0:
return []
for line in result.stdout.split("\n"):
stripped = line.strip()
if not stripped or "grep" in stripped:
continue
parts = stripped.split(None, 1)
if len(parts) == 2:
try:
pid = int(parts[0])
command = parts[1]
except ValueError:
pid = None
pid = None
command = ""
if pid is None:
aux_parts = stripped.split()
if len(aux_parts) > 10 and aux_parts[1].isdigit():
pid = int(aux_parts[1])
command = " ".join(aux_parts[10:])
parts = stripped.split(None, 1)
if len(parts) == 2:
try:
pid = int(parts[0])
command = parts[1]
except ValueError:
pid = None
if pid is None:
continue
if any(pattern in command for pattern in patterns) and (
all_profiles or _matches_current_profile(command)
):
_append_unique_pid(pids, pid, exclude_pids)
if pid is None:
aux_parts = stripped.split()
if len(aux_parts) > 10 and aux_parts[1].isdigit():
pid = int(aux_parts[1])
command = " ".join(aux_parts[10:])
if pid is None:
continue
if any(pattern in command for pattern in patterns) and (
all_profiles or _matches_current_profile(command)
):
_append_unique_pid(pids, pid, exclude_pids)
except (OSError, subprocess.TimeoutExpired):
return []

View file

@ -0,0 +1,138 @@
"""Tests for /proc-based gateway PID detection in Docker environments.
Verifies that _scan_gateway_pids() uses /proc/*/cmdline when available
(Docker without procps) and falls back to ps only when /proc is absent.
See: NousResearch/hermes-agent#7622
"""
import os
from unittest.mock import MagicMock, patch
import hermes_cli.gateway as gateway_mod
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
_GATEWAY_CMD = "python -m hermes_cli.main gateway run"
_OTHER_CMD = "python -m some_other_thing"
def _fake_proc_dir(entries: dict):
"""Return side_effects that simulate /proc: isdir → True, listdir → pids,
open(cmdline) null-delimited command bytes."""
def _isdir(path):
return str(path) == "/proc"
def _listdir(path):
if str(path) == "/proc":
return [str(pid) for pid in entries] + ["self", "version"]
raise FileNotFoundError(path)
def _open(path, mode="r", **kwargs):
path_str = str(path)
if "/cmdline" in path_str:
pid = int(path_str.split("/proc/")[1].split("/")[0])
raw = entries.get(pid, "").encode("utf-8").replace(b" ", b"\x00")
m = MagicMock()
m.read.return_value = raw
m.__enter__ = lambda s: s
m.__exit__ = MagicMock(return_value=False)
return m
raise FileNotFoundError(path)
return _isdir, _listdir, _open
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
class TestProcFallback:
"""_scan_gateway_pids reads /proc when available, skips ps."""
def test_detects_gateway_pid_via_proc(self):
my_pid = os.getpid()
entries = {
my_pid: "python -m hermes_cli.main", # own process — excluded
12345: _GATEWAY_CMD,
99999: _OTHER_CMD,
}
_isdir, _listdir, _open = _fake_proc_dir(entries)
with (
patch("hermes_cli.gateway.is_windows", return_value=False),
patch("os.path.isdir", side_effect=_isdir),
patch("os.listdir", side_effect=_listdir),
patch("builtins.open", side_effect=_open),
patch("hermes_cli.gateway._get_ancestor_pids", return_value=set()),
patch("subprocess.run") as mock_ps,
):
pids = gateway_mod._scan_gateway_pids(set(), all_profiles=True)
assert 12345 in pids
assert 99999 not in pids
mock_ps.assert_not_called() # ps must NOT be called when /proc worked
def test_excludes_own_pid_from_proc_scan(self):
my_pid = os.getpid()
entries = {my_pid: _GATEWAY_CMD}
_isdir, _listdir, _open = _fake_proc_dir(entries)
with (
patch("hermes_cli.gateway.is_windows", return_value=False),
patch("os.path.isdir", side_effect=_isdir),
patch("os.listdir", side_effect=_listdir),
patch("builtins.open", side_effect=_open),
patch("hermes_cli.gateway._get_ancestor_pids", return_value=set()),
patch("subprocess.run"),
):
pids = gateway_mod._scan_gateway_pids(set(), all_profiles=True)
assert my_pid not in pids
def test_falls_back_to_ps_when_proc_absent(self):
ps_output = f"12345 {_GATEWAY_CMD}\n99999 {_OTHER_CMD}\n"
mock_result = MagicMock()
mock_result.returncode = 0
mock_result.stdout = ps_output
with (
patch("hermes_cli.gateway.is_windows", return_value=False),
patch("os.path.isdir", return_value=False),
patch("hermes_cli.gateway._get_ancestor_pids", return_value=set()),
patch("subprocess.run", return_value=mock_result) as mock_ps,
):
pids = gateway_mod._scan_gateway_pids(set(), all_profiles=True)
mock_ps.assert_called_once()
assert 12345 in pids
def test_proc_permission_error_skips_pid(self):
def _isdir(path):
return str(path) == "/proc"
def _listdir(path):
if str(path) == "/proc":
return ["12345", "self"]
raise FileNotFoundError
def _open(path, mode="r", **kwargs):
raise PermissionError("no access")
with (
patch("hermes_cli.gateway.is_windows", return_value=False),
patch("os.path.isdir", side_effect=_isdir),
patch("os.listdir", side_effect=_listdir),
patch("builtins.open", side_effect=_open),
patch("hermes_cli.gateway._get_ancestor_pids", return_value=set()),
patch("subprocess.run") as mock_ps,
):
pids = gateway_mod._scan_gateway_pids(set(), all_profiles=True)
# PermissionError swallowed — empty result, no crash
assert 12345 not in pids
mock_ps.assert_not_called() # /proc dir existed, so ps not called