hermes-agent/tests/hermes_cli/test_auth_toctou_file_modes.py
Teknium f5ee780124 test: migrate stale os.kill monkeypatches to gateway.status._pid_exists
PR #21561 migrated liveness probes across 14 call sites from
`os.kill(pid, 0)` to `gateway.status._pid_exists` (psutil-first) so
the gateway doesn't Ctrl+C-itself on Windows via bpo-14484. A handful of
tests still patched the old `os.kill` seam and either happened to pass
on POSIX (when PID 12345 incidentally wasn't alive on the CI worker) or
failed outright — on CI runs they surfaced as 7 flaky/stable failures.

Migrate each affected test to patch the correct seam:

- tests/tools/test_browser_orphan_reaper.py (5 tests)
    Patch `gateway.status._pid_exists` instead of `os.kill`.
    Rename test_permission_error_on_kill_check_skips to
    test_alive_legacy_daemon_is_reaped — the old assertion was
    "PermissionError on sig 0 → skip dir"; post-migration the
    untracked-alive-daemon path always reaps the dir after SIGTERM
    (best-effort semantics were preserved).

- tests/tools/test_windows_native_support.py (4 tests)
    Replace tests that asserted `os.kill` seam behavior with tests
    that exercise `ProcessRegistry._is_host_pid_alive` as a
    delegator and split out a new TestPidExistsOSErrorWidening class
    that hits `gateway.status._pid_exists` directly via the POSIX
    fallback branch (so Windows-style `OSError(WinError 87)` + `PermissionError`
    widening is still covered on Linux CI).

- tests/tools/test_process_registry.py (1 test)
    Mock `psutil.Process` + `_pid_exists` instead of `os.kill`
    for the detached-session kill path.

- tests/tools/test_mcp_stability.py::test_kill_orphaned_uses_sigkill_when_available
    SIGTERM → alive-check → SIGKILL flow now uses `_pid_exists`
    for the middle step; assertion count drops from 3 to 2.

- tests/gateway/test_status.py::TestScopedLocks (2 tests)
    `acquire_scoped_lock` consults `_pid_exists`; patch that
    seam directly instead of trying to control the nested psutil
    call via os.kill monkeypatch.

- tests/hermes_cli/test_gateway.py::test_stop_profile_gateway_keeps_pid_file_when_process_still_running
    The stop loop sends one SIGTERM via os.kill then polls 20x via
    _pid_exists; instrument both separately. Old assertion
    `calls["kill"] == 21` split into `kill == 1` + `alive_probes == 20`.

- tests/hermes_cli/test_auth_toctou_file_modes.py::test_shared_nous_store_writes_0o600_with_0o700_parent
    Commit c34884ea2 switched the pytest seat-belt guard in
    `_nous_shared_store_path()` from `Path.home() / ".hermes"`
    to `get_default_hermes_root()`, which honors HERMES_HOME. The
    test sets both HERMES_HOME and HERMES_SHARED_AUTH_DIR to
    subpaths of the same tmp_path, and the override now collapses
    onto the same path the guard is refusing. Renamed the override
    subdirectory so the two paths diverge — guard passes, test runs.

All 21 original CI failures and their local-flaky siblings now pass
(278 tests across the touched files, 0 failures).
2026-05-08 14:27:40 -07:00

202 lines
7.7 KiB
Python

"""Regression tests for TOCTOU-safe credential file writers in ``hermes_cli.auth``.
Background
==========
The three writers below used to create a temp file via ``Path.write_text`` /
``Path.open('w')`` and only ``chmod``'d it to ``0o600`` afterward. Between
create and chmod the file existed at the process umask (typically ``0o644``),
briefly exposing OAuth tokens to other local users on multi-user hosts. The
fix switches them to ``os.open(O_EXCL, mode=0o600)`` + ``os.fdopen`` +
``fsync`` so the file is atomic at ``0o600`` on creation. Mirrors the fixes
shipped for ``agent/google_oauth.py`` (#19673) and ``tools/mcp_oauth.py``
(#21148).
These tests stay green only while the token file and its parent directory
end up at ``0o600`` / ``0o700`` after every write. POSIX-only — the mode-bit
enforcement does not exist on Windows.
"""
from __future__ import annotations
import json
import os
import stat
import sys
from unittest.mock import patch
import pytest
# Module-level pytest marker: skip every test in this file when running on
# Windows, because the assertions below inspect POSIX permission bits.
pytestmark = pytest.mark.skipif(
sys.platform.startswith("win"),
reason="POSIX mode bits not enforced on Windows",
)
# ---------------------------------------------------------------------------
# _save_auth_store (~/.hermes/auth.json — every native OAuth provider)
# ---------------------------------------------------------------------------
def test_save_auth_store_writes_0o600_with_0o700_parent(tmp_path, monkeypatch):
    """Check that ``_save_auth_store`` leaves ``auth.json`` at 0o600 in a 0o700 dir.

    ``HERMES_HOME`` is redirected to ``tmp_path`` and the process umask is
    forced to ``0o022`` while the writer runs; the previous umask is always
    restored afterwards.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    # A permissive umask makes the race observable if it regresses.
    saved_umask = os.umask(0o022)
    try:
        from hermes_cli import auth as auth_mod

        codex_entry = {"tokens": {"access_token": "secret-x"}}
        written = auth_mod._save_auth_store(
            {
                "version": auth_mod.AUTH_STORE_VERSION,
                "providers": {"openai-codex": codex_entry},
                "active_provider": "openai-codex",
            }
        )
    finally:
        os.umask(saved_umask)

    # Permission bits of the file itself.
    file_bits = stat.S_IMODE(written.stat().st_mode)
    assert file_bits == 0o600, (
        f"auth.json mode 0o{file_bits:o} != 0o600 — TOCTOU race regressed"
    )

    # Permission bits of the directory that holds it.
    dir_bits = stat.S_IMODE(written.parent.stat().st_mode)
    assert dir_bits == 0o700, (
        f"auth.json parent dir mode 0o{dir_bits:o} != 0o700 — siblings can traverse"
    )

    # The payload must round-trip through the rewrite unchanged.
    stored = json.loads(written.read_text())
    token = stored["providers"]["openai-codex"]["tokens"]["access_token"]
    assert token == "secret-x"
# ---------------------------------------------------------------------------
# _save_qwen_cli_tokens (Qwen CLI OAuth tokens)
# ---------------------------------------------------------------------------
def test_save_qwen_cli_tokens_writes_0o600_with_0o700_parent(tmp_path, monkeypatch):
    """Check that ``_save_qwen_cli_tokens`` writes a 0o600 file in a 0o700 dir.

    Both ``HERMES_HOME`` and ``HOME`` are redirected to ``tmp_path`` and the
    umask is forced to ``0o022`` while the writer runs; the previous umask is
    always restored afterwards.
    """
    sandbox = str(tmp_path)
    monkeypatch.setenv("HERMES_HOME", sandbox)
    # By default the Qwen CLI auth path sits under $HOME/.qwen, so HOME is
    # isolated as well.
    monkeypatch.setenv("HOME", sandbox)

    saved_umask = os.umask(0o022)
    try:
        from hermes_cli import auth as auth_mod

        written = auth_mod._save_qwen_cli_tokens(
            {
                "access_token": "qwen-secret",
                "refresh_token": "qwen-refresh",
                "token_type": "Bearer",
                "expiry_date": 123,
            }
        )
    finally:
        os.umask(saved_umask)

    # Permission bits of the token file.
    file_bits = stat.S_IMODE(written.stat().st_mode)
    assert file_bits == 0o600, (
        f"Qwen token file mode 0o{file_bits:o} != 0o600 — TOCTOU race regressed"
    )

    # Permission bits of its parent directory.
    dir_bits = stat.S_IMODE(written.parent.stat().st_mode)
    assert dir_bits == 0o700, (
        f"Qwen token parent dir mode 0o{dir_bits:o} != 0o700"
    )

    # The access token must be readable back from disk.
    stored = json.loads(written.read_text())
    assert stored["access_token"] == "qwen-secret"
# ---------------------------------------------------------------------------
# Nous shared-credential store write (inside _write_shared_nous_state)
# ---------------------------------------------------------------------------
def test_shared_nous_store_writes_0o600_with_0o700_parent(tmp_path, monkeypatch):
    """Check that the Nous shared-credential store is 0o600 inside a 0o700 dir.

    The state is written through ``_write_shared_nous_state`` and the
    resulting file is located via ``_nous_shared_store_path``, both while the
    umask is forced to ``0o022``; the previous umask is always restored.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    # During pytest runs _nous_shared_store_path() refuses to touch the real
    # shared store, so it is redirected into tmp_path explicitly. The
    # override uses its own subdirectory name (``shared_override``): the
    # guard's "real user home" reference currently follows HERMES_HOME via
    # get_default_hermes_root(), and a distinct name keeps the override from
    # colliding with it and being mistaken for the real user's shared store.
    override_dir = tmp_path / "shared_override"
    monkeypatch.setenv("HERMES_SHARED_AUTH_DIR", str(override_dir))

    saved_umask = os.umask(0o022)
    try:
        from hermes_cli import auth as auth_mod

        auth_mod._write_shared_nous_state(
            {
                "access_token": "nous-access-xxx",
                "refresh_token": "nous-refresh-xxx",
                "token_type": "Bearer",
                "scope": "openid profile",
                "client_id": "test-client",
                "obtained_at": "2026-01-01T00:00:00Z",
                "expires_at": "2026-01-01T01:00:00Z",
            }
        )
        store_path = auth_mod._nous_shared_store_path()
    finally:
        os.umask(saved_umask)

    assert store_path.exists(), "shared Nous store was not written"

    # Permission bits of the store file.
    file_bits = stat.S_IMODE(store_path.stat().st_mode)
    assert file_bits == 0o600, (
        f"Nous shared store mode 0o{file_bits:o} != 0o600 — TOCTOU race regressed"
    )

    # Permission bits of its parent directory.
    dir_bits = stat.S_IMODE(store_path.parent.stat().st_mode)
    assert dir_bits == 0o700, (
        f"Nous shared store parent dir mode 0o{dir_bits:o} != 0o700"
    )

    # The refresh token must be readable back from disk.
    stored = json.loads(store_path.read_text())
    assert stored["refresh_token"] == "nous-refresh-xxx"
# ---------------------------------------------------------------------------
# Atomicity: verify ``os.open`` is called with an explicit 0o600 mode.
# ---------------------------------------------------------------------------
def test_save_auth_store_uses_os_open_with_0o600_mode(tmp_path, monkeypatch):
    """Regression: the writer must call ``os.open`` with an explicit restricted mode.

    Creating the file at 0o600 atomically closes the TOCTOU window the
    previous ``Path.open('w')`` left open (fd inherited process umask and was
    briefly 0o644 before post-write chmod).

    ``os.open`` is replaced with a spy that records ``(path, flags, mode)``
    for each call and then delegates to the real function. Every recorded
    call whose path contains ``auth.json.tmp`` must carry ``O_CREAT``,
    ``O_EXCL`` and a mode of exactly ``0o600``.
    """
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    observed_opens: list[tuple[str, int, int]] = []
    real_os_open = os.open

    def spying_os_open(path, flags, mode=0o777, *args, **kwargs):
        # Record the call, then delegate so the writer still works normally.
        observed_opens.append((str(path), flags, mode))
        return real_os_open(path, flags, mode, *args, **kwargs)

    with patch.object(os, "open", spying_os_open):
        from hermes_cli import auth as auth_mod

        auth_mod._save_auth_store(
            {"version": auth_mod.AUTH_STORE_VERSION, "providers": {}}
        )

    # Only the temp-file opens for auth.json are relevant here.
    auth_tmp_opens = [
        (p, fl, m) for (p, fl, m) in observed_opens if "auth.json.tmp" in p
    ]
    assert auth_tmp_opens, (
        "os.open was never called for the auth.json temp file; "
        f"observed={observed_opens!r}"
    )

    # Must be exactly S_IRUSR | S_IWUSR (0o600) — no group/other bits.
    expected = stat.S_IRUSR | stat.S_IWUSR
    for path, flags, mode in auth_tmp_opens:
        assert flags & os.O_CREAT, f"auth.json temp open missing O_CREAT: path={path}"
        assert flags & os.O_EXCL, (
            f"auth.json temp open missing O_EXCL — TOCTOU-safe pattern regressed: "
            f"path={path}, flags={flags}"
        )
        # The " — " separator keeps the two halves of the message from
        # running together ("...0o600umask would apply...").
        assert mode == expected, (
            f"auth.json temp open mode 0o{mode:o} != 0o{expected:o} — "
            f"umask would apply and potentially expose tokens"
        )