tools(terminal): nudge homebrewed CI pollers at the tool surface (#33142)

Background processes whose command contains `gh pr view --json
statusCheckRollup` or `gh pr checks | jq` now get a runtime hint in
the result pointing at the canonical green-ci-policy snippets. The
homebrew shape has caused at least seven silent CI-watcher failures
in the past two weeks (#31329, #31448, #31695, #31709, #31745,
#32264, #33131) — each one a different jq/awk/grep variation of the
same fundamental problem (stdout buffering, jq null-key edge cases,
conclusion-vs-status confusion, TTY-only banner grepping).

The skill that documents this anti-pattern is excellent, but a skill
only fires if the agent loads it. The tool surface fires on every
misuse. This is the embed-footguns-in-tool-surface pattern from
PR #31289 applied to a recurring failure mode that's outgrown
skill-only enforcement.

Detector is deliberately narrow — flags two specific shapes:

  1. Any command containing `statusCheckRollup` (the JSON-API path —
     conclusion vs status field semantics keep burning us).
  2. `gh pr view` / `gh pr checks` combined with `jq` (gh pr
     checks doesn't emit JSON, so any `| jq` here is confused intent;
     the canonical column-2 poller uses awk-on-tabs, not jq).

Does NOT flag the blessed column-2 awk-on-tabs poller (which uses
`awk -F"\t" "\==\"pending\""`) or the exit-code-driven
`gh pr checks $PR >/dev/null` snippet.

Hint composes with the existing background-without-notify_on_complete
hint — both can fire on the same call. Each is independently
actionable.

Tests:
- 4 new cases in tests/tools/test_notify_on_complete.py
- test_homebrew_ci_poller_via_statusCheckRollup_emits_hint (positive)
- test_homebrew_ci_poller_via_gh_pr_checks_piped_to_jq_emits_hint (positive)
- test_canonical_column2_awk_poller_does_not_emit_homebrew_hint (negative)
- test_canonical_gh_pr_checks_exit_code_loop_does_not_emit_hint (negative)
- test_non_ci_background_command_does_not_emit_homebrew_hint (negative)
- 30/30 passing (was 26)
This commit is contained in:
Teknium 2026-05-27 02:22:08 -07:00 committed by GitHub
parent a890389b69
commit 187cf0f257
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 235 additions and 0 deletions

View file

@ -503,3 +503,159 @@ def test_foreground_command_does_not_emit_hint(monkeypatch, tmp_path):
assert "hint" not in result, (
f"Foreground commands must not emit the background-silence hint, got: {result.get('hint')!r}"
)
# ---------------------------------------------------------------------------
# Homebrewed-CI-watcher hint
#
# Background processes whose command looks like a hand-rolled CI poller
# (`gh pr view` / `gh pr checks` combined with jq/awk on stdout) get an
# additional hint pointing at the canonical green-ci-policy snippet. The
# homebrew shape has burned us repeatedly (May 2026 PRs #31329, #31448,
# #31695, #31709, #31745, #32264, #33131) with stdout buffering, jq null
# keys, conclusion-vs-status confusion, and TTY-only banner grepping —
# none of which the canonical snippets suffer from. Fire on every detection;
# false positives are cheap (~one read).
# ---------------------------------------------------------------------------
def test_homebrew_ci_poller_via_statusCheckRollup_emits_hint(monkeypatch, tmp_path):
"""The canonical anti-pattern: jq pipeline parsing statusCheckRollup
JSON. Tool must point the agent at the green-ci-policy skill snippet."""
tt = _silent_bg_harness(monkeypatch, tmp_path)
try:
result = json.loads(
tt.terminal_tool(
command=(
"PR=12345; while true; do "
"status=$(gh pr view $PR --json statusCheckRollup "
"--jq '[.statusCheckRollup[] | .conclusion] "
"| group_by(.) | map({k:.[0],v:length}) | from_entries'); "
"echo \"$status\"; sleep 30; done"
),
background=True,
notify_on_complete=True,
)
)
finally:
tt._active_environments.pop("default", None)
tt._last_activity.pop("default", None)
hint = result.get("hint", "")
assert hint, "Homebrew CI poller must emit a hint pointing at green-ci-policy"
assert "green-ci-policy" in hint, (
"Hint must name the canonical skill file so the agent can find the verbatim snippets"
)
# Naming exit-code-driven OR column-2 in the hint is what makes it actionable.
assert "exit" in hint.lower() or "column-2" in hint.lower() or "tab" in hint.lower(), (
"Hint must point at the canonical alternatives (exit-code or column-2)"
)
def test_homebrew_ci_poller_via_gh_pr_checks_piped_to_jq_emits_hint(monkeypatch, tmp_path):
"""`gh pr checks` doesn't emit JSON, so piping it to jq is a confused-
intent anti-pattern that produces silent failures (jq fails, loop
keeps spinning with empty data)."""
tt = _silent_bg_harness(monkeypatch, tmp_path)
try:
result = json.loads(
tt.terminal_tool(
command=(
"PR=99; while true; do "
"gh pr checks $PR | jq -R 'split(\"\\t\")[1]'; "
"sleep 30; done"
),
background=True,
notify_on_complete=True,
)
)
finally:
tt._active_environments.pop("default", None)
tt._last_activity.pop("default", None)
hint = result.get("hint", "")
assert hint, "Homebrew `gh pr checks | jq` poller must emit a hint"
assert "green-ci-policy" in hint
def test_canonical_column2_awk_poller_does_not_emit_homebrew_hint(monkeypatch, tmp_path):
"""The blessed column-2 awk-on-tabs poller from green-ci-policy is the
PREFERRED pattern for sharded matrices. Must not be flagged as
homebrew the gating signal is statusCheckRollup or `gh pr checks
| jq`, NOT awk on tabs."""
tt = _silent_bg_harness(monkeypatch, tmp_path)
try:
result = json.loads(
tt.terminal_tool(
command=(
"PR=1; while :; do "
"out=$(gh pr checks $PR 2>&1); "
"pending=$(echo \"$out\" | awk -F\"\\t\" \"\\$2==\\\"pending\\\"\" | wc -l); "
"failed=$(echo \"$out\" | awk -F\"\\t\" \"\\$2==\\\"fail\\\"\" | wc -l); "
"if [ \"$pending\" -eq 0 ]; then "
"[ \"$failed\" -gt 0 ] && exit 1 || exit 0; "
"fi; sleep 30; "
"done"
),
background=True,
notify_on_complete=True,
)
)
finally:
tt._active_environments.pop("default", None)
tt._last_activity.pop("default", None)
assert "hint" not in result, (
f"Canonical column-2 awk poller must not be flagged as homebrew, got: {result.get('hint')!r}"
)
def test_canonical_gh_pr_checks_exit_code_loop_does_not_emit_hint(monkeypatch, tmp_path):
"""The blessed exit-code-driven snippet from green-ci-policy is exactly
what we want no jq, no awk-on-stdout, gates the loop on exit code.
Must not be flagged as a homebrew anti-pattern."""
tt = _silent_bg_harness(monkeypatch, tmp_path)
try:
result = json.loads(
tt.terminal_tool(
command=(
"PR=1; while :; do "
"gh pr checks $PR >/dev/null 2>&1; rc=$?; "
"case $rc in 0) exit 0;; 8) sleep 30;; *) exit 1;; esac; "
"done"
),
background=True,
notify_on_complete=True,
)
)
finally:
tt._active_environments.pop("default", None)
tt._last_activity.pop("default", None)
# No silent-process hint (we have notify_on_complete) AND no
# homebrew-poller hint (no jq / awk pipeline parsing stdout).
assert "hint" not in result, (
f"Canonical exit-code-driven poller must not be flagged as homebrew, got: {result.get('hint')!r}"
)
def test_non_ci_background_command_does_not_emit_homebrew_hint(monkeypatch, tmp_path):
"""A long-running task that happens to use awk for unrelated reasons
must not be mistaken for a CI poller the gating signal is the
combination of `gh pr ...` AND a stdout parser."""
tt = _silent_bg_harness(monkeypatch, tmp_path)
try:
result = json.loads(
tt.terminal_tool(
command="cat /var/log/syslog | awk '/error/ {print}' > /tmp/errs.log",
background=True,
notify_on_complete=True,
)
)
finally:
tt._active_environments.pop("default", None)
tt._last_activity.pop("default", None)
assert "hint" not in result, (
f"Non-CI command using awk must not be flagged as homebrew CI poller, got: {result.get('hint')!r}"
)

View file

@ -1901,6 +1901,85 @@ def terminal_tool(
"that never exit (servers, watchers, daemons)."
)
# Nudge: homebrewed CI watcher built from `gh pr view`
# `--json statusCheckRollup` or `gh pr checks` piped through
# `jq` is the #1 cause of silent CI-watcher failures in
# hermes-agent dev work. May 2026 PRs that surfaced this
# exact failure mode: #31329, #31448, #31695, #31709, #31745,
# #32264, #33131. Failure modes seen:
# * `gh pr view --json statusCheckRollup --jq ...` with
# `from_entries` choking on null `conclusion` keys, loop
# silently exits with empty status, never terminates.
# * `for i in $(seq 1 60); do ... 2>&1` block-buffered stdout
# never flushed to background-process capture; SIGTERM
# cuts the buffer before flush; `process(action='log')`
# returns total_lines=0 forever.
# * conclusion vs. status field confusion: filtering for
# `PENDING` in `.conclusion` while in-progress checks have
# empty conclusion → poller declares all-green while 18/23
# checks still IN_PROGRESS.
# * grepping for TTY-only banners ("All checks were
# successful") that never appear when stdout is piped.
# The canonical patterns in the green-ci-policy skill avoid
# every one of these — drive the loop off exit codes or on
# tab-separated `awk -F"\t" "$2==\"pending\""` (column 2).
# The detector here is deliberately narrow: it flags the
# statusCheckRollup JSON-API path and the `gh pr checks` +
# jq combination, but NOT the canonical column-2 awk
# poller (which uses awk on tabs, not as a generic
# stdout parser). When we detect the homebrew shape, point
# the agent at the canonical snippet rather than letting
# it ship another broken poller.
if background and command:
_gh = ("gh pr view" in command or "gh pr checks" in command)
_has_jq = (
" jq " in command or "| jq" in command or "$(jq" in command
)
_bad_shape = (
# The JSON-API anti-pattern. Even without jq, going
# through `--json statusCheckRollup` + parsing puts
# you in conclusion-vs-status field hell.
"statusCheckRollup" in command
# gh pr checks piped to jq is also wrong — `gh pr
# checks` doesn't emit JSON, so any `| jq` here is
# confused intent. The canonical column-2 poller
# uses awk-on-tabs, not jq.
or (_gh and _has_jq)
)
if _bad_shape:
existing = result_data.get("hint", "")
canonical_hint = (
"This looks like a homebrewed CI poller built from "
"`gh pr view --json statusCheckRollup` and/or "
"`gh pr checks | jq`. That shape has burned us "
"repeatedly in hermes-agent dev work (PRs #31329, "
"#31448, #31695, #31709, #31745, #32264, #33131) — "
"stdout buffering kills output capture, jq null-key "
"edge cases silently exit the loop, conclusion-vs-"
"status field confusion exits early with bogus "
"all-green verdicts, TTY-only summary banners "
"never appear when piped. Use the canonical "
"snippets in the green-ci-policy skill instead: "
"the exit-code-driven `gh pr checks $PR >/dev/null` "
"(rc 0 = green, 8 = pending, else fail) for "
"exit-on-first-fail behavior, or the column-2 "
"awk-on-tabs poller "
"(`awk -F\"\\t\" \"$2==\\\"pending\\\"\"`) for "
"sharded matrices. Load skill_view("
"name='github/hermes-agent-dev', "
"file_path='references/green-ci-policy.md') for "
"the verbatim snippets. If you must roll a custom "
"loop with rich structured output, write each tick "
"to a known file (`tee -a /tmp/ci.log`) and rely "
"on `process(action='log')` to read THAT file — "
"do not rely on background-process stdout capture "
"for line-buffered shell loops."
)
result_data["hint"] = (
existing + "\n\n" + canonical_hint if existing
else canonical_hint
)
# Populate routing metadata on the session so that
# watch-pattern and completion notifications can be
# routed back to the correct chat/thread.