mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
Closes #6051. Reported failure mode: agent migrated to WSL2, browser launch failed because Playwright wasn't installed yet. Background reviewer captured the failure as a durable skill (`browser-tool-launch-issue`) and the agent kept refusing the browser tool for weeks after Playwright was installed and verified working. Negative claims also propagated into unrelated skills ("browser tools do not work", "cannot use Y from execute_code"). Root cause: `_SKILL_REVIEW_PROMPT` and `_COMBINED_REVIEW_PROMPT` both lean hard on "be active, save things, a pass that does nothing is a missed learning opportunity." Neither distinguished durable knowledge from transient environment state. The reviewer was doing what it was told. Fix at the write site — both prompts now carry a "Do NOT capture" section calling out: • Environment-dependent failures (missing binaries, fresh-install errors, post-migration path mismatches, 'command not found', unconfigured credentials, uninstalled packages) • Negative claims about tools or features ("X does not work") that harden into self-cited refusals • Session-specific transient errors that resolved before the conversation ended • One-off task narratives ("summarize today's market", "analyze this PR") — also addresses the #12812 / #4538 family Plus a positive-reframing line: when a tool fails because of setup state, capture the FIX (install command, config step, env var) under an existing setup/troubleshooting skill — never "this tool doesn't work" as a standalone constraint. Targeted tests: 24/24 passing in tests/run_agent/test_review_prompt_class_first.py (2 new + all existing review-prompt assertions). Substring-based checks so future prompt edits don't false-fail.
235 lines
10 KiB
Python
235 lines
10 KiB
Python
"""Behavior tests for the skill review / combined review prompts.
|
|
|
|
The review prompts steer the background review agent toward actively updating
|
|
the skill library after most sessions, with a strong bias toward:
|
|
1. Patching currently-loaded skills first,
|
|
2. Patching existing umbrellas next,
|
|
3. Adding references/ files under an existing umbrella,
|
|
4. Creating a new class-level umbrella only when nothing else fits.
|
|
|
|
User-preference corrections (style, format, verbosity, legibility) are
|
|
first-class skill signals, not just memory signals.
|
|
|
|
These tests assert behavioral *instructions* are present — they do NOT
|
|
snapshot the full prompt text (change-detector).
|
|
"""
|
|
|
|
from run_agent import AIAgent
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _SKILL_REVIEW_PROMPT
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_skill_review_prompt_biases_toward_active_updates():
|
|
"""Prompt must frame updating as the default stance, not something rare."""
|
|
prompt = AIAgent._SKILL_REVIEW_PROMPT
|
|
assert "ACTIVE" in prompt or "active" in prompt.lower(), (
|
|
"must tell the reviewer to be active"
|
|
)
|
|
# "missed learning opportunity" or equivalent framing for not acting
|
|
assert "missed" in prompt.lower() or "opportunity" in prompt.lower(), (
|
|
"must frame inaction as a miss, not a neutral outcome"
|
|
)
|
|
|
|
|
|
def test_skill_review_prompt_treats_user_corrections_as_skill_signal():
|
|
"""Style/format/verbosity complaints must be FIRST-CLASS skill signals, not just memory."""
|
|
prompt = AIAgent._SKILL_REVIEW_PROMPT
|
|
lower = prompt.lower()
|
|
# Must mention style/format/verbosity-family corrections
|
|
assert any(k in lower for k in ("style", "format", "verbos", "legib", "tone")), (
|
|
"must name style/format/verbosity/legibility as signals"
|
|
)
|
|
# Must frame these as first-class skill signals (not memory-only)
|
|
assert "FIRST-CLASS" in prompt or "first-class" in prompt, (
|
|
"must explicitly label user-preference corrections as first-class skill signals"
|
|
)
|
|
# Must mention the correction-type phrases to tune the model's ear
|
|
assert "stop doing" in lower or "don't" in lower or "hate" in lower or "frustrat" in lower, (
|
|
"must give concrete phrasing examples so the model recognizes corrections"
|
|
)
|
|
|
|
|
|
def test_skill_review_prompt_prefers_loaded_skills_first():
|
|
"""Currently-loaded skills must be the first patch target."""
|
|
prompt = AIAgent._SKILL_REVIEW_PROMPT
|
|
assert "LOADED" in prompt or "loaded" in prompt, (
|
|
"must mention currently-loaded skills"
|
|
)
|
|
# Must name the mechanisms for detecting loaded skills
|
|
assert "skill_view" in prompt and "/skill" in prompt, (
|
|
"must name skill_view and /skill-name as loaded-skill signals"
|
|
)
|
|
|
|
|
|
def test_skill_review_prompt_has_four_step_preference_order():
|
|
"""The 4-step patch/support-file/create ladder must be present."""
|
|
prompt = AIAgent._SKILL_REVIEW_PROMPT
|
|
assert "PATCH" in prompt
|
|
assert "references/" in prompt or "REFERENCE" in prompt
|
|
assert "CREATE" in prompt
|
|
assert "UMBRELLA" in prompt or "umbrella" in prompt
|
|
|
|
|
|
def test_skill_review_prompt_names_three_support_file_kinds():
|
|
"""Support-file step must name references/, templates/, and scripts/."""
|
|
prompt = AIAgent._SKILL_REVIEW_PROMPT
|
|
assert "references/" in prompt, "must name references/ as a support-file kind"
|
|
assert "templates/" in prompt, "must name templates/ as a support-file kind"
|
|
assert "scripts/" in prompt, "must name scripts/ as a support-file kind"
|
|
# Purpose hints for each kind
|
|
assert "knowledge" in prompt.lower() or "research" in prompt.lower() or "API docs" in prompt, (
|
|
"must mention knowledge-bank / research / API-docs role of references/"
|
|
)
|
|
assert "copied" in prompt.lower() or "starter" in prompt.lower() or "reproduce" in prompt.lower(), (
|
|
"must mention that templates/ are starter files to copy/modify"
|
|
)
|
|
assert "re-runnable" in prompt.lower() or "verification" in prompt.lower() or "probe" in prompt.lower(), (
|
|
"must mention that scripts/ are re-runnable actions"
|
|
)
|
|
|
|
|
|
def test_skill_review_prompt_has_name_veto_for_create():
|
|
"""Creating a new skill must be gated behind class-level naming."""
|
|
prompt = AIAgent._SKILL_REVIEW_PROMPT
|
|
assert "class level" in prompt.lower() or "CLASS-LEVEL" in prompt
|
|
assert "MUST NOT" in prompt or "must not" in prompt, (
|
|
"must have a name-veto clause blocking session-artifact names"
|
|
)
|
|
|
|
|
|
def test_skill_review_prompt_embeds_user_preferences_in_skills():
|
|
"""Must explicitly say user-preference lessons belong in SKILL.md, not only memory."""
|
|
prompt = AIAgent._SKILL_REVIEW_PROMPT
|
|
lower = prompt.lower()
|
|
assert "preference" in lower, "must mention user preferences"
|
|
assert "memory" in lower and "skill" in lower, (
|
|
"must contrast memory vs skill responsibilities"
|
|
)
|
|
|
|
|
|
def test_skill_review_prompt_flags_overlap_and_defers_to_curator():
|
|
"""Reviewer should not consolidate live; flag overlap for the curator."""
|
|
prompt = AIAgent._SKILL_REVIEW_PROMPT
|
|
assert "overlap" in prompt.lower()
|
|
assert "curator" in prompt.lower(), "must defer consolidation to the curator"
|
|
|
|
|
|
def test_skill_review_prompt_still_has_opt_out_clause():
|
|
"""'Nothing to save.' must remain as a real-but-not-default option."""
|
|
prompt = AIAgent._SKILL_REVIEW_PROMPT
|
|
assert "Nothing to save." in prompt
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _COMBINED_REVIEW_PROMPT
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_combined_review_prompt_has_memory_section():
|
|
"""Memory half must still cover user facts and preferences."""
|
|
prompt = AIAgent._COMBINED_REVIEW_PROMPT
|
|
assert "**Memory**" in prompt
|
|
assert "memory tool" in prompt
|
|
|
|
|
|
def test_combined_review_prompt_skills_biased_toward_active_updates():
|
|
"""Skills half must carry the active-update bias."""
|
|
prompt = AIAgent._COMBINED_REVIEW_PROMPT
|
|
assert "**Skills**" in prompt
|
|
assert "ACTIVE" in prompt or "active" in prompt.lower()
|
|
assert "missed" in prompt.lower() or "opportunity" in prompt.lower()
|
|
|
|
|
|
def test_combined_review_prompt_treats_user_corrections_as_skill_signal():
|
|
"""Combined prompt must carry the same user-preference-is-skill-signal rule."""
|
|
prompt = AIAgent._COMBINED_REVIEW_PROMPT
|
|
lower = prompt.lower()
|
|
assert any(k in lower for k in ("style", "format", "verbos", "legib", "tone"))
|
|
assert "FIRST-CLASS" in prompt or "first-class" in prompt
|
|
|
|
|
|
def test_combined_review_prompt_prefers_loaded_skills_first():
|
|
"""Combined prompt must also prefer loaded skills first."""
|
|
prompt = AIAgent._COMBINED_REVIEW_PROMPT
|
|
assert "LOADED" in prompt or "loaded" in prompt
|
|
assert "skill_view" in prompt and "/skill" in prompt
|
|
|
|
|
|
def test_combined_review_prompt_has_four_step_skill_ladder():
|
|
"""Combined prompt must keep the patch/support-file/create ladder on the Skills half."""
|
|
prompt = AIAgent._COMBINED_REVIEW_PROMPT
|
|
assert "PATCH" in prompt
|
|
assert "references/" in prompt or "REFERENCE" in prompt
|
|
assert "CREATE" in prompt
|
|
assert "CLASS-LEVEL" in prompt or "class-level" in prompt or "class level" in prompt.lower()
|
|
|
|
|
|
def test_combined_review_prompt_names_three_support_file_kinds():
|
|
"""Combined prompt must also name all three support-file kinds."""
|
|
prompt = AIAgent._COMBINED_REVIEW_PROMPT
|
|
assert "references/" in prompt
|
|
assert "templates/" in prompt
|
|
assert "scripts/" in prompt
|
|
|
|
|
|
def test_combined_review_prompt_preserves_opt_out_clause():
|
|
prompt = AIAgent._COMBINED_REVIEW_PROMPT
|
|
assert "Nothing to save." in prompt
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Anti-pattern guidance — see issue #6051. The reviewer was learning transient
|
|
# environment failures (e.g. "browser tools do not work" from a fresh-install
|
|
# Playwright miss) as durable skill rules, then citing them against itself for
|
|
# weeks after the environment was fixed. Both review prompts must explicitly
|
|
# tell the reviewer not to capture environment-dependent or negative-framing
|
|
# content as skills.
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _assert_anti_pattern_guidance(prompt: str, label: str) -> None:
|
|
"""Both review prompts must carry the same anti-pattern section."""
|
|
lower = prompt.lower()
|
|
assert "do not capture" in lower, (
|
|
f"{label}: must have an explicit 'Do NOT capture' section"
|
|
)
|
|
# Environment-dependent failures (the #6051 root cause)
|
|
assert any(k in lower for k in ("missing binar", "command not found", "uninstalled", "fresh-install")), (
|
|
f"{label}: must call out environment/setup failures as not-skill-worthy"
|
|
)
|
|
# Negative-framing avoidance
|
|
assert any(k in lower for k in ("negative claim", "do not work", "is broken")), (
|
|
f"{label}: must call out negative-claim phrasings as the failure mode"
|
|
)
|
|
# Positive reframing — "capture the fix, not the failure"
|
|
assert "capture the fix" in lower or "capture the fix " in lower, (
|
|
f"{label}: must redirect tool-failure capture toward the fix, not the constraint"
|
|
)
|
|
# One-off task narratives (#12812 family)
|
|
assert "one-off" in lower, (
|
|
f"{label}: must call out one-off task narratives as not-skill-worthy"
|
|
)
|
|
|
|
|
|
def test_skill_review_prompt_has_anti_pattern_guidance():
|
|
"""_SKILL_REVIEW_PROMPT must tell the reviewer NOT to capture transient env failures (#6051)."""
|
|
_assert_anti_pattern_guidance(AIAgent._SKILL_REVIEW_PROMPT, "_SKILL_REVIEW_PROMPT")
|
|
|
|
|
|
def test_combined_review_prompt_has_anti_pattern_guidance():
|
|
"""_COMBINED_REVIEW_PROMPT must carry the same guidance — same failure mode applies."""
|
|
_assert_anti_pattern_guidance(AIAgent._COMBINED_REVIEW_PROMPT, "_COMBINED_REVIEW_PROMPT")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# _MEMORY_REVIEW_PROMPT — unchanged, still memory-focused
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_memory_review_prompt_still_focused_on_user_facts():
|
|
"""Memory-only review prompt stays focused on user facts — not touched by this change."""
|
|
prompt = AIAgent._MEMORY_REVIEW_PROMPT
|
|
# The memory-only prompt should NOT drift into skill territory
|
|
assert "skills_list" not in prompt
|
|
assert "SURVEY" not in prompt
|
|
assert "memory tool" in prompt
|