diff --git a/run_agent.py b/run_agent.py index 8c3e28e68d3..96d4d8517fe 100644 --- a/run_agent.py +++ b/run_agent.py @@ -3949,6 +3949,26 @@ class AIAgent: "skill that governs that task needs to carry the lesson.\n\n" "If you notice two existing skills that overlap, note it in your " "reply — the background curator handles consolidation at scale.\n\n" + "Do NOT capture (these become persistent self-imposed constraints " + "that bite you later when the environment changes):\n" + " • Environment-dependent failures: missing binaries, fresh-install " + "errors, post-migration path mismatches, 'command not found', " + "unconfigured credentials, uninstalled packages. The user can fix " + "these — they are not durable rules.\n" + " • Negative claims about tools or features ('browser tools do not " + "work', 'X tool is broken', 'cannot use Y from execute_code'). These " + "harden into refusals the agent cites against itself for months " + "after the actual problem was fixed.\n" + " • Session-specific transient errors that resolved before the " + "conversation ended. If retrying worked, the lesson is the retry " + "pattern, not the original failure.\n" + " • One-off task narratives. A user asking 'summarize today's " + "market' or 'analyze this PR' is not a class of work that warrants " + "a skill.\n\n" + "If a tool failed because of setup state, capture the FIX (install " + "command, config step, env var to set) under an existing setup or " + "troubleshooting skill — never 'this tool does not work' as a " + "standalone constraint.\n\n" "'Nothing to save.' is a real option but should NOT be the " "default. If the session ran smoothly with no corrections and " "produced no new technique, just say 'Nothing to save.' and stop. " @@ -4006,6 +4026,26 @@ class AIAgent: "should carry user-preference lessons when relevant.\n\n" "If you notice overlapping existing skills, mention it — the " "background curator handles consolidation.\n\n" + "Do NOT capture as skills (these become persistent self-imposed " + "constraints that bite you later when the environment changes):\n" + " • Environment-dependent failures: missing binaries, fresh-install " + "errors, post-migration path mismatches, 'command not found', " + "unconfigured credentials, uninstalled packages. The user can fix " + "these — they are not durable rules.\n" + " • Negative claims about tools or features ('browser tools do not " + "work', 'X tool is broken', 'cannot use Y from execute_code'). These " + "harden into refusals the agent cites against itself for months " + "after the actual problem was fixed.\n" + " • Session-specific transient errors that resolved before the " + "conversation ended. If retrying worked, the lesson is the retry " + "pattern, not the original failure.\n" + " • One-off task narratives. A user asking 'summarize today's " + "market' or 'analyze this PR' is not a class of work that warrants " + "a skill.\n\n" + "If a tool failed because of setup state, capture the FIX (install " + "command, config step, env var to set) under an existing setup or " + "troubleshooting skill — never 'this tool does not work' as a " + "standalone constraint.\n\n" "Act on whichever of the two dimensions has real signal. If " "genuinely nothing stands out on either, say 'Nothing to save.' " "and stop — but don't reach for that conclusion as a default." diff --git a/tests/run_agent/test_review_prompt_class_first.py b/tests/run_agent/test_review_prompt_class_first.py index c9f30fa575b..1e95e159c8d 100644 --- a/tests/run_agent/test_review_prompt_class_first.py +++ b/tests/run_agent/test_review_prompt_class_first.py @@ -178,6 +178,50 @@ def test_combined_review_prompt_preserves_opt_out_clause(): assert "Nothing to save." in prompt +# --------------------------------------------------------------------------- +# Anti-pattern guidance — see issue #6051. The reviewer was learning transient +# environment failures (e.g. "browser tools do not work" from a fresh-install +# Playwright miss) as durable skill rules, then citing them against itself for +# weeks after the environment was fixed. Both review prompts must explicitly +# tell the reviewer not to capture environment-dependent or negative-framing +# content as skills. +# --------------------------------------------------------------------------- + + +def _assert_anti_pattern_guidance(prompt: str, label: str) -> None: + """Both review prompts must carry the same anti-pattern section.""" + lower = prompt.lower() + assert "do not capture" in lower, ( + f"{label}: must have an explicit 'Do NOT capture' section" + ) + # Environment-dependent failures (the #6051 root cause) + assert any(k in lower for k in ("missing binar", "command not found", "uninstalled", "fresh-install")), ( + f"{label}: must call out environment/setup failures as not-skill-worthy" + ) + # Negative-framing avoidance + assert any(k in lower for k in ("negative claim", "do not work", "is broken")), ( + f"{label}: must call out negative-claim phrasings as the failure mode" + ) + # Positive reframing — "capture the fix, not the failure" + assert "capture the fix" in lower or "capture the fix " in lower, ( + f"{label}: must redirect tool-failure capture toward the fix, not the constraint" + ) + # One-off task narratives (#12812 family) + assert "one-off" in lower, ( + f"{label}: must call out one-off task narratives as not-skill-worthy" + ) + + +def test_skill_review_prompt_has_anti_pattern_guidance(): + """_SKILL_REVIEW_PROMPT must tell the reviewer NOT to capture transient env failures (#6051).""" + _assert_anti_pattern_guidance(AIAgent._SKILL_REVIEW_PROMPT, "_SKILL_REVIEW_PROMPT") + + +def test_combined_review_prompt_has_anti_pattern_guidance(): + """_COMBINED_REVIEW_PROMPT must carry the same guidance — same failure mode applies.""" + _assert_anti_pattern_guidance(AIAgent._COMBINED_REVIEW_PROMPT, "_COMBINED_REVIEW_PROMPT") + + # --------------------------------------------------------------------------- # _MEMORY_REVIEW_PROMPT — unchanged, still memory-focused # ---------------------------------------------------------------------------