hermes-agent/optional-skills/research/darwinian-evolver/scripts/parrot_openrouter.py
teknium1 c9b32a654c feat(skill): darwinian-evolver optional skill
Thin wrapper around Imbue's darwinian_evolver (AGPL-3.0, subprocess-only).
Ships a working OpenRouter driver (parrot_openrouter.py), a snapshot
inspector (show_snapshot.py), and a custom-problem template. SKILL.md
has 58-char description, Pitfalls sourced from actually running the loop:
non-viable seed trap, Azure content filter killing runs, loop.run() being
a generator, nested-pickle snapshots, and aggressive default concurrency.

Salvaged from #12719 by @Bihruze — original PR shipped 12,289 LOC across
61 files (29 Python modules, FastAPI dashboard, VS Code extension,
benchmark hub, marketplace, etc.) which was far beyond the scope of the
underlying issue (#336). This version stays at the ~700-LOC scope that
issue actually asked for. Authorship of the original effort credited via
AUTHOR_MAP entry and the SKILL.md author field.

Verified end-to-end: seed 'Say {{ phrase }}' (score 0.000) evolved into
'Please repeat the following phrase exactly as it is, without any
modifications or additional formatting: {{ phrase }}' (score 0.750)
across 3 iterations on gpt-4o-mini via OpenRouter.

Co-authored-by: Bihruze <98262967+Bihruze@users.noreply.github.com>
2026-05-15 21:56:07 -07:00

218 lines
7.6 KiB
Python

"""
parrot_openrouter: same as the upstream `parrot` example but the LLM call goes
through OpenRouter (OpenAI SDK) instead of Anthropic native. Lets us run an
end-to-end evolution with whatever model the user already has paid access to.
Run with:

    uv --project darwinian_evolver run python parrot_openrouter.py \
        --num_iterations 3 --output_dir /tmp/parrot_out

Reads `OPENROUTER_API_KEY` from the environment.
"""
from __future__ import annotations
import argparse
import os
import sys
from pathlib import Path
import jinja2
from openai import OpenAI
# Vendored problem types from upstream (AGPL — only run via subprocess in production)
from darwinian_evolver.cli_common import build_hyperparameter_config_from_args
from darwinian_evolver.cli_common import register_hyperparameter_args
from darwinian_evolver.cli_common import parse_learning_log_view_type
from darwinian_evolver.evolve_problem_loop import EvolveProblemLoop
from darwinian_evolver.learning_log import LearningLogEntry
from darwinian_evolver.problem import EvaluationFailureCase
from darwinian_evolver.problem import EvaluationResult
from darwinian_evolver.problem import Evaluator
from darwinian_evolver.problem import Mutator
from darwinian_evolver.problem import Organism
from darwinian_evolver.problem import Problem
# Model slug sent with every chat-completion request. It is read once at
# import time; set EVOLVER_MODEL in the environment to override the default.
DEFAULT_MODEL = os.getenv("EVOLVER_MODEL", "openai/gpt-4o-mini")
def _client() -> OpenAI:
    """Build an OpenAI SDK client pointed at the OpenRouter endpoint.

    The API key is taken from the ``OPENROUTER_API_KEY`` environment variable.
    When that variable is unset or empty, the process is terminated through
    ``sys.exit`` with an explanatory message instead of returning a client.
    """
    api_key = os.environ.get("OPENROUTER_API_KEY")
    if api_key:
        return OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")
    sys.exit("OPENROUTER_API_KEY is not set")
def _prompt_llm(prompt: str) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    The request uses ``DEFAULT_MODEL`` and a 1024-token completion cap.

    Returns:
        The content of the first choice, or ``""`` when that content is
        empty/``None``. If the call raises any ``Exception``, a string of the
        form ``<LLM_ERROR: ExceptionType: message>`` is returned instead.
    """
    request = {
        "model": DEFAULT_MODEL,
        "max_tokens": 1024,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        completion = _client().chat.completions.create(**request)
        text = completion.choices[0].message.content
    except Exception as exc:
        # Deliberately broad: any provider error (rate limit, content filter,
        # schema reject) is reported as a failed response rather than raised,
        # so the organism just scores badly and the run keeps going.
        return f"<LLM_ERROR: {type(exc).__name__}: {exc}>"
    return text or ""
class ParrotOrganism(Organism):
    """Organism whose only evolvable state is a Jinja2 prompt template."""

    # Jinja2 source; rendered with a single variable named ``phrase``.
    prompt_template: str

    def run(self, phrase: str) -> str:
        """Render the template for ``phrase`` and return the LLM's reply.

        Returns:
            - ``"Error rendering prompt: ..."`` when Jinja2 raises a
              ``TemplateError`` while compiling or rendering the template.
            - ``""`` when the rendered prompt is empty (no LLM call is made).
            - Otherwise whatever ``_prompt_llm`` returns for the prompt.
        """
        try:
            rendered = jinja2.Template(self.prompt_template).render(phrase=phrase)
        except jinja2.exceptions.TemplateError as exc:
            return f"Error rendering prompt: {exc}"
        return _prompt_llm(rendered) if rendered else ""
class ParrotEvaluationFailureCase(EvaluationFailureCase):
    """Failure record for one phrase that was not echoed back verbatim."""

    # The phrase the organism was asked to repeat.
    phrase: str
    # What the organism's run() returned for that phrase.
    response: str
class ImproveParrotMutator(Mutator[ParrotOrganism, ParrotEvaluationFailureCase]):
    """Mutator that asks the LLM to rewrite a prompt template after a failure.

    The improvement prompt shows the current template together with one failing
    phrase/response pair; the replacement template is read from the last
    triple-backtick block of the LLM's answer.
    """

    # Rendered with ``organism`` and ``failure_case``; the closing instruction
    # is what lets ``mutate`` locate the proposed template in the reply.
    IMPROVEMENT_PROMPT_TEMPLATE = """
We want to build a prompt that causes an LLM to repeat back a given phrase verbatim.
The current prompt template is:
```
{{ organism.prompt_template }}
```
Unfortunately, on this phrase:
```
{{ failure_case.phrase }}
```
the LLM responded with:
```
{{ failure_case.response }}
```
Diagnose what went wrong, then propose an improved prompt template. Put the new
template in the LAST triple-backtick block of your response.
""".strip()

    def mutate(
        self,
        organism: ParrotOrganism,
        failure_cases: list[ParrotEvaluationFailureCase],
        learning_log_entries: list[LearningLogEntry],
    ) -> list[ParrotOrganism]:
        """Propose at most one improved organism from the first failure case.

        Args:
            organism: The organism whose template should be improved.
            failure_cases: Observed failures; only the first one is used.
            learning_log_entries: Accepted for interface compatibility; not
                used by this mutator.

        Returns:
            A one-element list holding the new organism, or ``[]`` when there
            is nothing to learn from (no failure cases), the LLM call failed,
            the reply has no complete triple-backtick block, the extracted
            template is empty, or building the organism raised.
        """
        # Without a failure there is nothing to show the LLM; previously this
        # raised IndexError on ``failure_cases[0]``.
        if not failure_cases:
            return []
        fc = failure_cases[0]
        prompt = jinja2.Template(self.IMPROVEMENT_PROMPT_TEMPLATE).render(
            organism=organism, failure_case=fc
        )
        try:
            resp = _prompt_llm(prompt)
            # _prompt_llm reports provider failures as an "<LLM_ERROR: ...>"
            # string. Never parse that as a model answer: an error message
            # that echoes the prompt would contain backtick fences.
            if resp.startswith("<LLM_ERROR:"):
                print(f"mutate error: {resp}", file=sys.stderr)
                return []
            parts = resp.split("```")
            # At least one opening and one closing fence are required.
            if len(parts) < 3:
                return []
            new_tpl = parts[-2].strip()
            # An empty template renders to an empty prompt, which can only
            # fail every phrase; do not spend an evaluation on it.
            if not new_tpl:
                return []
            return [ParrotOrganism(prompt_template=new_tpl)]
        except Exception as e:
            # Best effort: a failed mutation must not abort the run.
            print(f"mutate error: {e}", file=sys.stderr)
            return []
class ParrotEvaluator(Evaluator[ParrotOrganism, EvaluationResult, ParrotEvaluationFailureCase]):
    """Scores an organism by how many phrases it echoes back exactly."""

    # Phrases whose failures are reported as trainable failure cases.
    TRAINABLE_PHRASES = [
        "Hello world.",
        "bla",
        "Bla",
        "bla.",
        '"bla bla".',
        "Just say 'foo' once with no extra words.",
    ]
    # Phrases whose failures are reported as holdout failure cases.
    HOLDOUT_PHRASES = [
        "bla, but only once.",
        "'bla'",
    ]

    def evaluate(self, organism: ParrotOrganism) -> EvaluationResult:
        """Run ``organism`` on every phrase and summarise the outcome.

        A phrase passes only when the response equals it exactly. The score is
        the fraction of passing phrases over both phrase lists combined.
        """

        def failed_cases(phrases: list[str], id_prefix: str) -> list[ParrotEvaluationFailureCase]:
            # One failure case per phrase that was not echoed back exactly;
            # ids are "<id_prefix>_<position in the phrase list>".
            cases: list[ParrotEvaluationFailureCase] = []
            for index, phrase in enumerate(phrases):
                reply = organism.run(phrase)
                if reply == phrase:
                    continue
                cases.append(
                    ParrotEvaluationFailureCase(
                        phrase=phrase,
                        response=reply,
                        data_point_id=f"{id_prefix}_{index}",
                    )
                )
            return cases

        trainable_failures = failed_cases(self.TRAINABLE_PHRASES, "trainable")
        holdout_failures = failed_cases(self.HOLDOUT_PHRASES, "holdout")

        total = len(self.TRAINABLE_PHRASES) + len(self.HOLDOUT_PHRASES)
        passed = total - len(trainable_failures) - len(holdout_failures)
        return EvaluationResult(
            score=passed / total,
            trainable_failure_cases=trainable_failures,
            holdout_failure_cases=holdout_failures,
            # Always viable. Even a 0-score seed is a valid starting point; the
            # mutator should still get a chance to fix it.
            is_viable=True,
        )
def make_problem() -> Problem:
    """Assemble the parrot problem: evaluator, single mutator, and seed organism.

    The seed organism uses the template ``Say {{ phrase }}``.
    """
    problem_type = Problem[ParrotOrganism, EvaluationResult, ParrotEvaluationFailureCase]
    evaluator = ParrotEvaluator()
    mutators = [ImproveParrotMutator()]
    seed = ParrotOrganism(prompt_template="Say {{ phrase }}")
    return problem_type(
        evaluator=evaluator,
        mutators=mutators,
        initial_organism=seed,
    )
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the CLI parser: upstream hyperparameter flags plus this script's options."""
    parser = argparse.ArgumentParser()
    register_hyperparameter_args(parser.add_argument_group("hyperparameters"))
    parser.add_argument("--num_iterations", type=int, default=3)
    parser.add_argument("--mutator_concurrency", type=int, default=4)
    parser.add_argument("--evaluator_concurrency", type=int, default=4)
    parser.add_argument("--output_dir", type=str, required=True)
    return parser


def main() -> int:
    """Run the evolution loop from the command line and persist its results.

    For every snapshot yielded by the loop this writes
    ``<output_dir>/snapshots/iteration_<n>.pkl``, prints a one-line progress
    summary, and appends one JSON line to ``<output_dir>/results.jsonl``.

    Returns:
        ``0`` once the loop has been exhausted.
    """
    import json

    args = _build_arg_parser().parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    hyperparams = build_hyperparameter_config_from_args(args)
    problem = make_problem()
    view_type = parse_learning_log_view_type(hyperparams.learning_log_view_type)
    loop = EvolveProblemLoop(
        problem=problem,
        learning_log_view_type=view_type,
        num_parents_per_iteration=hyperparams.num_parents_per_iteration,
        mutator_concurrency=args.mutator_concurrency,
        evaluator_concurrency=args.evaluator_concurrency,
        fixed_midpoint_score=hyperparams.fixed_midpoint_score,
        midpoint_score_percentile=hyperparams.midpoint_score_percentile,
        sharpness=hyperparams.sharpness,
        novelty_weight=hyperparams.novelty_weight,
        batch_size=hyperparams.batch_size,
        should_verify_mutations=hyperparams.verify_mutations,
    )

    results_path = output_dir / "results.jsonl"
    snapshots_dir = output_dir / "snapshots"
    snapshots_dir.mkdir(exist_ok=True)

    print("Evaluating initial organism...")
    # loop.run() is consumed lazily: each iteration of this for-loop yields one
    # snapshot, which is persisted before the next one is requested.
    for snapshot in loop.run(num_iterations=args.num_iterations):
        pickle_path = snapshots_dir / f"iteration_{snapshot.iteration}.pkl"
        pickle_path.write_bytes(snapshot.snapshot)

        _, best_result = snapshot.best_organism_result
        print(f"iter={snapshot.iteration} pop={snapshot.population_size} "
              f"best_score={best_result.score:.3f}")

        # Append mode: the log is reopened and extended on every iteration.
        with results_path.open("a") as log_file:
            record = {
                "iteration": snapshot.iteration,
                "best_score": best_result.score,
                "pop_size": snapshot.population_size,
                "score_percentiles": {
                    str(pct): value for pct, value in snapshot.score_percentiles.items()
                },
            }
            log_file.write(json.dumps(record) + "\n")

    print(f"\nDone. Results in: {output_dir}")
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())