From 26abac5afd431685e55db1d7b2e12ecc4f3eb064 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 21 Apr 2026 01:33:10 -0700 Subject: [PATCH] test(conftest): reset module-level state + unset platform allowlists (#13400) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three fixes that close the remaining structural sources of CI flakes after PR #13363. ## 1. Per-test reset of module-level singletons and ContextVars Python modules are singletons per process, and pytest-xdist workers are long-lived. Module-level dicts/sets and ContextVars persist across tests on the same worker. A test that sets state in `tools.approval._session_approved` and doesn't explicitly clear it leaks that state to every subsequent test on the same worker. New `_reset_module_state` autouse fixture in `tests/conftest.py` clears: - tools.approval: _session_approved, _session_yolo, _permanent_approved, _pending, _gateway_queues, _gateway_notify_cbs, _approval_session_key - tools.interrupt: _interrupted_threads - gateway.session_context: 10 session/cron ContextVars (reset to _UNSET) - tools.env_passthrough: _allowed_env_vars_var (reset to empty set) - tools.credential_files: _registered_files_var (reset to empty dict) - tools.file_tools: _read_tracker, _file_ops_cache This was the single biggest remaining class of CI flakes. `test_command_guards::test_warn_session_approved` and `test_combined_cli_session_approves_both` were failing 12/15 recent main runs specifically because `_session_approved` carried approvals from a prior test's session into these tests' `"default"` session lookup. ## 2. Unset platform allowlist env vars in hermetic fixture `TELEGRAM_ALLOWED_USERS`, `DISCORD_ALLOWED_USERS`, and 20 other `*_ALLOWED_USERS` / `*_ALLOW_ALL_USERS` vars are now unset per-test in the same place credential env vars already are. These aren't credentials but they change gateway auth behavior; if set from any source (user shell, leaky test, CI env) they flake button-authorization tests. Fixes three `test_telegram_approval_buttons` tests that were failing across recent runs of the full gateway directory. ## 3. Two specific tests with module-level captured state - `test_signal::TestSignalPhoneRedaction`: `agent.redact._REDACT_ENABLED` is captured at module import from `HERMES_REDACT_SECRETS`, not read per-call. `monkeypatch.delenv` at test time is too late. Added `monkeypatch.setattr("agent.redact._REDACT_ENABLED", True)` per skill xdist-cross-test-pollution Pattern 5. - `test_internal_event_bypass_pairing::test_non_internal_event_without_user_triggers_pairing`: `gateway.pairing.PAIRING_DIR` is captured at module import from HERMES_HOME, so per-test HERMES_HOME redirection in conftest doesn't retroactively move it. Test now monkeypatches PAIRING_DIR directly to its tmp_path, preventing rate-limit state from prior xdist workers from letting the pairing send-call be suppressed. ## Validation - tests/tools/: 3494 pass (0 fail) including test_command_guards - tests/gateway/: 3504 pass (0 fail) across repeat runs - tests/agent/ + tests/hermes_cli/ + tests/run_agent/ + tests/tools/: 8371 pass, 37 skipped, 0 fail — full suite across directories No production code changed. --- tests/conftest.py | 126 ++++++++++++++++++ .../test_internal_event_bypass_pairing.py | 9 ++ tests/gateway/test_signal.py | 6 + 3 files changed, 141 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index ca4a9a970..0258e034f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -186,6 +186,31 @@ _HERMES_BEHAVIORAL_VARS = frozenset({ "HERMES_HOME_MODE", "BROWSER_CDP_URL", "CAMOFOX_URL", + # Platform allowlists — not credentials, but if set from any source + # (user shell, earlier leaky test, CI env), they change gateway auth + # behavior and flake button-authorization tests. + "TELEGRAM_ALLOWED_USERS", + "DISCORD_ALLOWED_USERS", + "WHATSAPP_ALLOWED_USERS", + "SLACK_ALLOWED_USERS", + "SIGNAL_ALLOWED_USERS", + "SIGNAL_GROUP_ALLOWED_USERS", + "EMAIL_ALLOWED_USERS", + "SMS_ALLOWED_USERS", + "MATTERMOST_ALLOWED_USERS", + "MATRIX_ALLOWED_USERS", + "DINGTALK_ALLOWED_USERS", + "FEISHU_ALLOWED_USERS", + "WECOM_ALLOWED_USERS", + "GATEWAY_ALLOWED_USERS", + "GATEWAY_ALLOW_ALL_USERS", + "TELEGRAM_ALLOW_ALL_USERS", + "DISCORD_ALLOW_ALL_USERS", + "WHATSAPP_ALLOW_ALL_USERS", + "SLACK_ALLOW_ALL_USERS", + "SIGNAL_ALLOW_ALL_USERS", + "EMAIL_ALLOW_ALL_USERS", + "SMS_ALLOW_ALL_USERS", }) @@ -258,6 +283,107 @@ def _isolate_hermes_home(_hermetic_environment): return None +# ── Module-level state reset ─────────────────────────────────────────────── +# +# Python modules are singletons per process, and pytest-xdist workers are +# long-lived. Module-level dicts/sets (tool registries, approval state, +# interrupt flags) and ContextVars persist across tests in the same worker, +# causing tests that pass alone to fail when run with siblings. +# +# Each entry in this fixture clears state that belongs to a specific module. +# New state buckets go here too — this is the single gate that prevents +# "works alone, flakes in CI" bugs from state leakage. +# +# The skill `test-suite-cascade-diagnosis` documents the concrete patterns +# this closes; the running example was `test_command_guards` failing 12/15 +# CI runs because ``tools.approval._session_approved`` carried approvals +# from one test's session into another's. + +@pytest.fixture(autouse=True) +def _reset_module_state(): + """Clear module-level mutable state and ContextVars between tests. + + Keeps state from leaking across tests on the same xdist worker. Modules + that don't exist yet (test collection before production import) are + skipped silently — production import later creates fresh empty state. + """ + # --- tools.approval — the single biggest source of cross-test pollution --- + try: + from tools import approval as _approval_mod + _approval_mod._session_approved.clear() + _approval_mod._session_yolo.clear() + _approval_mod._permanent_approved.clear() + _approval_mod._pending.clear() + _approval_mod._gateway_queues.clear() + _approval_mod._gateway_notify_cbs.clear() + # ContextVar: reset to empty string so get_current_session_key() + # falls through to the env var / default path, matching a fresh + # process. + _approval_mod._approval_session_key.set("") + except Exception: + pass + + # --- tools.interrupt — per-thread interrupt flag set --- + try: + from tools import interrupt as _interrupt_mod + with _interrupt_mod._lock: + _interrupt_mod._interrupted_threads.clear() + except Exception: + pass + + # --- gateway.session_context — 9 ContextVars that represent + # the active gateway session. If set in one test and not reset, + # the next test's get_session_env() reads stale values. + try: + from gateway import session_context as _sc_mod + for _cv in ( + _sc_mod._SESSION_PLATFORM, + _sc_mod._SESSION_CHAT_ID, + _sc_mod._SESSION_CHAT_NAME, + _sc_mod._SESSION_THREAD_ID, + _sc_mod._SESSION_USER_ID, + _sc_mod._SESSION_USER_NAME, + _sc_mod._SESSION_KEY, + _sc_mod._CRON_AUTO_DELIVER_PLATFORM, + _sc_mod._CRON_AUTO_DELIVER_CHAT_ID, + _sc_mod._CRON_AUTO_DELIVER_THREAD_ID, + ): + _cv.set(_sc_mod._UNSET) + except Exception: + pass + + # --- tools.env_passthrough — ContextVar with no default --- + # LookupError is normal if the test never set it. Setting it to an + # empty set unconditionally normalizes the starting state. + try: + from tools import env_passthrough as _envp_mod + _envp_mod._allowed_env_vars_var.set(set()) + except Exception: + pass + + # --- tools.credential_files — ContextVar --- + try: + from tools import credential_files as _credf_mod + _credf_mod._registered_files_var.set({}) + except Exception: + pass + + # --- tools.file_tools — per-task read history + file-ops cache --- + # _read_tracker accumulates per-task_id read history for loop detection, + # capped by _READ_HISTORY_CAP. If entries from a prior test persist, the + # cap is hit faster than expected and capacity-related tests flake. + try: + from tools import file_tools as _ft_mod + with _ft_mod._read_tracker_lock: + _ft_mod._read_tracker.clear() + with _ft_mod._file_ops_lock: + _ft_mod._file_ops_cache.clear() + except Exception: + pass + + yield + + @pytest.fixture() def tmp_dir(tmp_path): """Provide a temporary directory that is cleaned up automatically.""" diff --git a/tests/gateway/test_internal_event_bypass_pairing.py b/tests/gateway/test_internal_event_bypass_pairing.py index d10195b2d..887884253 100644 --- a/tests/gateway/test_internal_event_bypass_pairing.py +++ b/tests/gateway/test_internal_event_bypass_pairing.py @@ -355,8 +355,17 @@ async def test_none_user_id_does_not_generate_pairing_code(monkeypatch, tmp_path async def test_non_internal_event_without_user_triggers_pairing(monkeypatch, tmp_path): """Verify the normal (non-internal) path still triggers pairing for unknown users.""" import gateway.run as gateway_run + import gateway.pairing as pairing_mod monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + # gateway.pairing.PAIRING_DIR is a module-level constant captured at + # import time from whichever HERMES_HOME was set then. Per-test + # HERMES_HOME redirection in conftest doesn't retroactively move it. + # Override directly so pairing rate-limit state lives in this test's + # tmp_path (and so stale state from prior xdist workers can't leak in). + pairing_dir = tmp_path / "pairing" + pairing_dir.mkdir() + monkeypatch.setattr(pairing_mod, "PAIRING_DIR", pairing_dir) (tmp_path / "config.yaml").write_text("", encoding="utf-8") # Clear env vars that could let all users through (loaded by diff --git a/tests/gateway/test_signal.py b/tests/gateway/test_signal.py index d7943b7f9..b51ec713f 100644 --- a/tests/gateway/test_signal.py +++ b/tests/gateway/test_signal.py @@ -306,7 +306,13 @@ class TestSignalSessionSource: class TestSignalPhoneRedaction: @pytest.fixture(autouse=True) def _ensure_redaction_enabled(self, monkeypatch): + # agent.redact snapshots _REDACT_ENABLED at import time from the + # HERMES_REDACT_SECRETS env var. monkeypatch.delenv is too late — + # the module was already imported during test collection with + # whatever value was in the env then. Force the flag directly. + # See skill: xdist-cross-test-pollution Pattern 5. monkeypatch.delenv("HERMES_REDACT_SECRETS", raising=False) + monkeypatch.setattr("agent.redact._REDACT_ENABLED", True) def test_us_number(self): from agent.redact import redact_sensitive_text