tools(terminal): nudge homebrewed CI pollers at the tool surface (#33142)

Background processes whose command contains `gh pr view --json statusCheckRollup` or `gh pr checks | jq` now get a runtime hint in the result pointing at the canonical green-ci-policy snippets. The homebrew shape has caused at least seven silent CI-watcher failures in the past two weeks (#31329, #31448, #31695, #31709, #31745, #32264, #33131) — each one a different jq/awk/grep variation of the same fundamental problem (stdout buffering, jq null-key edge cases, conclusion-vs-status confusion, TTY-only banner grepping). The skill that documents this anti-pattern is excellent, but a skill only fires if the agent loads it. The tool surface fires on every misuse. This is the embed-footguns-in-tool-surface pattern from PR #31289 applied to a recurring failure mode that's outgrown skill-only enforcement. Detector is deliberately narrow — flags two specific shapes: 1. Any command containing `statusCheckRollup` (the JSON-API path — conclusion vs status field semantics keep burning us). 2. `gh pr view` / `gh pr checks` combined with `jq` (gh pr checks doesn't emit JSON, so any `| jq` here is confused intent; the canonical column-2 poller uses awk-on-tabs, not jq). Does NOT flag the blessed column-2 awk-on-tabs poller (which uses `awk -F"\t" "\==\"pending\""`) or the exit-code-driven `gh pr checks $PR >/dev/null` snippet. Hint composes with the existing background-without-notify_on_complete hint — both can fire on the same call. Each is independently actionable. Tests: - 4 new cases in tests/tools/test_notify_on_complete.py - test_homebrew_ci_poller_via_statusCheckRollup_emits_hint (positive) - test_homebrew_ci_poller_via_gh_pr_checks_piped_to_jq_emits_hint (positive) - test_canonical_column2_awk_poller_does_not_emit_homebrew_hint (negative) - test_canonical_gh_pr_checks_exit_code_loop_does_not_emit_hint (negative) - test_non_ci_background_command_does_not_emit_homebrew_hint (negative) - 30/30 passing (was 26)
2026-06-06 07:51:53 +00:00 · 2026-05-27 02:22:08 -07:00 · 2026-05-27 02:22:08 -07:00 · 187cf0f257
commit 187cf0f257
parent a890389b69
2 changed files with 235 additions and 0 deletions
--- a/tests/tools/test_notify_on_complete.py
+++ b/tests/tools/test_notify_on_complete.py
@ -503,3 +503,159 @@ def test_foreground_command_does_not_emit_hint(monkeypatch, tmp_path):
    assert "hint" not in result, (
        f"Foreground commands must not emit the background-silence hint, got: {result.get('hint')!r}"
    )
+
+
+# ---------------------------------------------------------------------------
+# Homebrewed-CI-watcher hint
+#
+# Background processes whose command looks like a hand-rolled CI poller
+# (`gh pr view` / `gh pr checks` combined with jq/awk on stdout) get an
+# additional hint pointing at the canonical green-ci-policy snippet. The
+# homebrew shape has burned us repeatedly (May 2026 PRs #31329, #31448,
+# #31695, #31709, #31745, #32264, #33131) with stdout buffering, jq null
+# keys, conclusion-vs-status confusion, and TTY-only banner grepping —
+# none of which the canonical snippets suffer from. Fire on every detection;
+# false positives are cheap (~one read).
+# ---------------------------------------------------------------------------
+
+
+def test_homebrew_ci_poller_via_statusCheckRollup_emits_hint(monkeypatch, tmp_path):
+    """The canonical anti-pattern: jq pipeline parsing statusCheckRollup
+    JSON. Tool must point the agent at the green-ci-policy skill snippet."""
+    tt = _silent_bg_harness(monkeypatch, tmp_path)
+    try:
+        result = json.loads(
+            tt.terminal_tool(
+                command=(
+                    "PR=12345; while true; do "
+                    "status=$(gh pr view $PR --json statusCheckRollup "
+                    "--jq '[.statusCheckRollup[] | .conclusion] "
+                    "| group_by(.) | map({k:.[0],v:length}) | from_entries'); "
+                    "echo \"$status\"; sleep 30; done"
+                ),
+                background=True,
+                notify_on_complete=True,
+            )
+        )
+    finally:
+        tt._active_environments.pop("default", None)
+        tt._last_activity.pop("default", None)
+
+    hint = result.get("hint", "")
+    assert hint, "Homebrew CI poller must emit a hint pointing at green-ci-policy"
+    assert "green-ci-policy" in hint, (
+        "Hint must name the canonical skill file so the agent can find the verbatim snippets"
+    )
+    # Naming exit-code-driven OR column-2 in the hint is what makes it actionable.
+    assert "exit" in hint.lower() or "column-2" in hint.lower() or "tab" in hint.lower(), (
+        "Hint must point at the canonical alternatives (exit-code or column-2)"
+    )
+
+
+def test_homebrew_ci_poller_via_gh_pr_checks_piped_to_jq_emits_hint(monkeypatch, tmp_path):
+    """`gh pr checks` doesn't emit JSON, so piping it to jq is a confused-
+    intent anti-pattern that produces silent failures (jq fails, loop
+    keeps spinning with empty data)."""
+    tt = _silent_bg_harness(monkeypatch, tmp_path)
+    try:
+        result = json.loads(
+            tt.terminal_tool(
+                command=(
+                    "PR=99; while true; do "
+                    "gh pr checks $PR | jq -R 'split(\"\\t\")[1]'; "
+                    "sleep 30; done"
+                ),
+                background=True,
+                notify_on_complete=True,
+            )
+        )
+    finally:
+        tt._active_environments.pop("default", None)
+        tt._last_activity.pop("default", None)
+
+    hint = result.get("hint", "")
+    assert hint, "Homebrew `gh pr checks | jq` poller must emit a hint"
+    assert "green-ci-policy" in hint
+
+
+def test_canonical_column2_awk_poller_does_not_emit_homebrew_hint(monkeypatch, tmp_path):
+    """The blessed column-2 awk-on-tabs poller from green-ci-policy is the
+    PREFERRED pattern for sharded matrices. Must not be flagged as
+    homebrew — the gating signal is statusCheckRollup or `gh pr checks
+    | jq`, NOT awk on tabs."""
+    tt = _silent_bg_harness(monkeypatch, tmp_path)
+    try:
+        result = json.loads(
+            tt.terminal_tool(
+                command=(
+                    "PR=1; while :; do "
+                    "out=$(gh pr checks $PR 2>&1); "
+                    "pending=$(echo \"$out\" | awk -F\"\\t\" \"\\$2==\\\"pending\\\"\" | wc -l); "
+                    "failed=$(echo \"$out\" | awk -F\"\\t\" \"\\$2==\\\"fail\\\"\" | wc -l); "
+                    "if [ \"$pending\" -eq 0 ]; then "
+                    "[ \"$failed\" -gt 0 ] && exit 1 || exit 0; "
+                    "fi; sleep 30; "
+                    "done"
+                ),
+                background=True,
+                notify_on_complete=True,
+            )
+        )
+    finally:
+        tt._active_environments.pop("default", None)
+        tt._last_activity.pop("default", None)
+
+    assert "hint" not in result, (
+        f"Canonical column-2 awk poller must not be flagged as homebrew, got: {result.get('hint')!r}"
+    )
+
+
+def test_canonical_gh_pr_checks_exit_code_loop_does_not_emit_hint(monkeypatch, tmp_path):
+    """The blessed exit-code-driven snippet from green-ci-policy is exactly
+    what we want — no jq, no awk-on-stdout, gates the loop on exit code.
+    Must not be flagged as a homebrew anti-pattern."""
+    tt = _silent_bg_harness(monkeypatch, tmp_path)
+    try:
+        result = json.loads(
+            tt.terminal_tool(
+                command=(
+                    "PR=1; while :; do "
+                    "gh pr checks $PR >/dev/null 2>&1; rc=$?; "
+                    "case $rc in 0) exit 0;; 8) sleep 30;; *) exit 1;; esac; "
+                    "done"
+                ),
+                background=True,
+                notify_on_complete=True,
+            )
+        )
+    finally:
+        tt._active_environments.pop("default", None)
+        tt._last_activity.pop("default", None)
+
+    # No silent-process hint (we have notify_on_complete) AND no
+    # homebrew-poller hint (no jq / awk pipeline parsing stdout).
+    assert "hint" not in result, (
+        f"Canonical exit-code-driven poller must not be flagged as homebrew, got: {result.get('hint')!r}"
+    )
+
+
+def test_non_ci_background_command_does_not_emit_homebrew_hint(monkeypatch, tmp_path):
+    """A long-running task that happens to use awk for unrelated reasons
+    must not be mistaken for a CI poller — the gating signal is the
+    combination of `gh pr ...` AND a stdout parser."""
+    tt = _silent_bg_harness(monkeypatch, tmp_path)
+    try:
+        result = json.loads(
+            tt.terminal_tool(
+                command="cat /var/log/syslog | awk '/error/ {print}' > /tmp/errs.log",
+                background=True,
+                notify_on_complete=True,
+            )
+        )
+    finally:
+        tt._active_environments.pop("default", None)
+        tt._last_activity.pop("default", None)
+
+    assert "hint" not in result, (
+        f"Non-CI command using awk must not be flagged as homebrew CI poller, got: {result.get('hint')!r}"
+    )
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@ -1901,6 +1901,85 @@ def terminal_tool(
                        "that never exit (servers, watchers, daemons)."
                    )

+                # Nudge: homebrewed CI watcher built from `gh pr view`
+                # `--json statusCheckRollup` or `gh pr checks` piped through
+                # `jq` is the #1 cause of silent CI-watcher failures in
+                # hermes-agent dev work. May 2026 PRs that surfaced this
+                # exact failure mode: #31329, #31448, #31695, #31709, #31745,
+                # #32264, #33131. Failure modes seen:
+                #   * `gh pr view --json statusCheckRollup --jq ...` with
+                #     `from_entries` choking on null `conclusion` keys, loop
+                #     silently exits with empty status, never terminates.
+                #   * `for i in $(seq 1 60); do ... 2>&1` block-buffered stdout
+                #     never flushed to background-process capture; SIGTERM
+                #     cuts the buffer before flush; `process(action='log')`
+                #     returns total_lines=0 forever.
+                #   * conclusion vs. status field confusion: filtering for
+                #     `PENDING` in `.conclusion` while in-progress checks have
+                #     empty conclusion → poller declares all-green while 18/23
+                #     checks still IN_PROGRESS.
+                #   * grepping for TTY-only banners ("All checks were
+                #     successful") that never appear when stdout is piped.
+                # The canonical patterns in the green-ci-policy skill avoid
+                # every one of these — drive the loop off exit codes or on
+                # tab-separated `awk -F"\t" "$2==\"pending\""` (column 2).
+                # The detector here is deliberately narrow: it flags the
+                # statusCheckRollup JSON-API path and the `gh pr checks` +
+                # jq combination, but NOT the canonical column-2 awk
+                # poller (which uses awk on tabs, not as a generic
+                # stdout parser). When we detect the homebrew shape, point
+                # the agent at the canonical snippet rather than letting
+                # it ship another broken poller.
+                if background and command:
+                    _gh = ("gh pr view" in command or "gh pr checks" in command)
+                    _has_jq = (
+                        " jq " in command or "| jq" in command or "$(jq" in command
+                    )
+                    _bad_shape = (
+                        # The JSON-API anti-pattern. Even without jq, going
+                        # through `--json statusCheckRollup` + parsing puts
+                        # you in conclusion-vs-status field hell.
+                        "statusCheckRollup" in command
+                        # gh pr checks piped to jq is also wrong — `gh pr
+                        # checks` doesn't emit JSON, so any `| jq` here is
+                        # confused intent. The canonical column-2 poller
+                        # uses awk-on-tabs, not jq.
+                        or (_gh and _has_jq)
+                    )
+                    if _bad_shape:
+                        existing = result_data.get("hint", "")
+                        canonical_hint = (
+                            "This looks like a homebrewed CI poller built from "
+                            "`gh pr view --json statusCheckRollup` and/or "
+                            "`gh pr checks | jq`. That shape has burned us "
+                            "repeatedly in hermes-agent dev work (PRs #31329, "
+                            "#31448, #31695, #31709, #31745, #32264, #33131) — "
+                            "stdout buffering kills output capture, jq null-key "
+                            "edge cases silently exit the loop, conclusion-vs-"
+                            "status field confusion exits early with bogus "
+                            "all-green verdicts, TTY-only summary banners "
+                            "never appear when piped. Use the canonical "
+                            "snippets in the green-ci-policy skill instead: "
+                            "the exit-code-driven `gh pr checks $PR >/dev/null` "
+                            "(rc 0 = green, 8 = pending, else fail) for "
+                            "exit-on-first-fail behavior, or the column-2 "
+                            "awk-on-tabs poller "
+                            "(`awk -F\"\\t\" \"$2==\\\"pending\\\"\"`) for "
+                            "sharded matrices. Load skill_view("
+                            "name='github/hermes-agent-dev', "
+                            "file_path='references/green-ci-policy.md') for "
+                            "the verbatim snippets. If you must roll a custom "
+                            "loop with rich structured output, write each tick "
+                            "to a known file (`tee -a /tmp/ci.log`) and rely "
+                            "on `process(action='log')` to read THAT file — "
+                            "do not rely on background-process stdout capture "
+                            "for line-buffered shell loops."
+                        )
+                        result_data["hint"] = (
+                            existing + "\n\n" + canonical_hint if existing
+                            else canonical_hint
+                        )
+
                # Populate routing metadata on the session so that
                # watch-pattern and completion notifications can be
                # routed back to the correct chat/thread.