mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-11 08:42:11 +00:00
* chore(skills): remove red-team skills (godmode, obliteratus) from bundled catalog

  Anthropic's output classifier on claude-fable-5 (and likely other Claude models served through it) intermittently returns empty content for sessions whose system prompt advertises these skills. The bundled skills-catalog block is injected into every session's system prompt, so the descriptions

  - red-teaming/godmode 'Jailbreak LLMs: Parseltongue, GODMODE, ULTRAPLINIAN'
  - mlops/inference/obliteratus 'OBLITERATUS: abliterate LLM refusals (diff-in-means)'

  trip the classifier on EVERY session regardless of which skill is actually loaded, killing unrelated legitimate work (PR review, codebase audits, etc.).

  Measured impact (controlled, interleaved A/B, claude-fable-5 via OpenRouter, prompts differing only by the ~204 chars of these catalog lines, N=20 each):

    catalog lines present -> 19/20 (95%) blocked
    catalog lines absent  ->  5/20 (25%) blocked

  Removing them ~quartered the block rate. Rewording the descriptions was not enough; the skills must leave the bundled catalog.

  - Delete skills/red-teaming/godmode and skills/mlops/inference/obliteratus
  - Drop their generated doc pages + catalog/sidebar entries (EN + zh-Hans)
  - Drop the godmode hand-written-page exception in generate-skill-docs.py

* chore(skills): relocate godmode + obliteratus to optional-skills

  Rather than deleting outright, move both into optional-skills/ so they remain installable via `hermes skills install` while leaving the always-injected bundled catalog (which is what tripped Anthropic's classifier).

  - optional-skills/security/godmode (was skills/red-teaming/godmode)
  - optional-skills/mlops/obliteratus (was skills/mlops/inference/obliteratus)
  - regenerate optional-skills catalog + sidebar entries
45 lines
1.5 KiB
Python
45 lines
1.5 KiB
Python
"""
|
|
Loader for G0DM0D3 scripts. Handles the exec-scoping issues.
|
|
|
|
Usage in execute_code:
|
|
exec(open(os.path.expanduser(
|
|
os.path.join(os.environ.get("HERMES_HOME", os.path.expanduser("~/.hermes")), "skills/red-teaming/godmode/scripts/load_godmode.py")
|
|
)).read())
|
|
|
|
# Now all functions are available:
|
|
# - auto_jailbreak(), undo_jailbreak()
|
|
# - race_models(), race_godmode_classic()
|
|
# - generate_variants(), obfuscate_query(), detect_triggers()
|
|
# - score_response(), is_refusal(), count_hedges()
|
|
# - escalate_encoding()
|
|
"""
|
|
|
|
import os, sys
|
|
from pathlib import Path
|
|
|
|
# Directory the scripts are loaded from: $HERMES_HOME (falling back to
# ~/.hermes when the variable is unset) + skills/red-teaming/godmode/scripts.
# NOTE(review): the accompanying commit message says the skill was relocated
# to optional-skills/security/godmode — confirm this installed path is still
# the right one.
_gm_scripts_dir = Path(
    os.getenv("HERMES_HOME", Path.home() / ".hermes")
).joinpath("skills", "red-teaming", "godmode", "scripts")

# Keep a reference to the caller's argv and install a placeholder while the
# scripts run (presumably so they do not react to the host process's
# command-line arguments — confirm against the scripts themselves).
_gm_old_argv, sys.argv = sys.argv, ["_godmode_loader"]
|
|
|
|
def _gm_load(path):
    """Execute the script at *path* in a copy of the current globals.

    The copy is taken at call time, so the script sees every name the
    calling namespace holds at that moment, but its own assignments land in
    the copy rather than in the caller's globals.  ``__name__`` is set to
    ``"_godmode_module"`` (so an ``if __name__ == "__main__":`` guard in the
    script does not fire) and ``__file__`` to the script's path.

    Args:
        path: Filesystem path (``str`` or ``pathlib.Path``) of the script.

    Returns:
        dict: The namespace after the script has run.

    Raises:
        OSError: If the file cannot be opened or read.
        SyntaxError: If the file is not valid Python.
        Exception: Anything the script itself raises while executing.
    """
    ns = dict(globals())
    ns["__name__"] = "_godmode_module"
    ns["__file__"] = str(path)
    # Read raw bytes inside a context manager: the handle is closed
    # deterministically, and compile() decodes the source the same way the
    # interpreter does (UTF-8 by default, honouring a BOM or coding cookie)
    # instead of depending on the platform's locale encoding.
    with open(path, "rb") as fh:
        source = fh.read()
    exec(compile(source, str(path), "exec"), ns)
    return ns
|
|
|
|
# Run each known script that exists on disk and publish its callables and
# UPPER_CASE names into this namespace's globals.  Scripts run in list order;
# because _gm_load copies globals() at call time, a later script can see the
# names published by an earlier one.  Missing scripts are skipped silently.
try:
    for _gm_script in ["parseltongue.py", "godmode_race.py", "auto_jailbreak.py"]:
        _gm_path = _gm_scripts_dir / _gm_script
        if _gm_path.exists():
            _gm_ns = _gm_load(_gm_path)
            for _gm_k, _gm_v in _gm_ns.items():
                # Skip the loader's own helpers; export functions/classes and
                # constant-style (all-uppercase) names only.
                if not _gm_k.startswith('_gm_') and (callable(_gm_v) or _gm_k.isupper()):
                    globals()[_gm_k] = _gm_v
finally:
    # Always hand the caller's argv back, even when a script raises;
    # otherwise the host process would keep the placeholder argv.
    sys.argv = _gm_old_argv
|
|
|
|
# Remove the loader's own bookkeeping names from globals so that only the
# names exported by the scripts remain.  pop(..., None) tolerates names that
# were never bound (e.g. when no script was found).  The loop variable is
# listed last so that the final iteration removes it as well.
for _gm_cleanup in (
    '_gm_scripts_dir', '_gm_old_argv', '_gm_load',
    '_gm_ns', '_gm_k', '_gm_v',
    '_gm_script', '_gm_path', '_gm_cleanup',
):
    globals().pop(_gm_cleanup, None)
|