feat(skill): darwinian-evolver optional skill

Thin wrapper around Imbue's darwinian_evolver (AGPL-3.0, subprocess-only). Ships a working OpenRouter driver (parrot_openrouter.py), a snapshot inspector (show_snapshot.py), and a custom-problem template. SKILL.md has 58-char description, Pitfalls sourced from actually running the loop: non-viable seed trap, Azure content filter killing runs, loop.run() being a generator, nested-pickle snapshots, and aggressive default concurrency. Salvaged from #12719 by @Bihruze — original PR shipped 12,289 LOC across 61 files (29 Python modules, FastAPI dashboard, VS Code extension, benchmark hub, marketplace, etc.) which was far beyond the scope of the underlying issue (#336). This version stays at the ~700-LOC scope that issue actually asked for. Authorship of the original effort credited via AUTHOR_MAP entry and the SKILL.md author field. Verified end-to-end: seed 'Say {{ phrase }}' (score 0.000) evolved into 'Please repeat the following phrase exactly as it is, without any modifications or additional formatting: {{ phrase }}' (score 0.750) across 3 iterations on gpt-4o-mini via OpenRouter. Co-authored-by: Bihruze <98262967+Bihruze@users.noreply.github.com>
2026-05-24 05:41:40 +00:00 · 2026-05-15 21:54:56 -07:00 · 2026-05-15 21:54:56 -07:00 · c9b32a654c
commit c9b32a654c
parent e377833fa6
5 changed files with 828 additions and 0 deletions
--- a/optional-skills/research/darwinian-evolver/scripts/parrot_openrouter.py
+++ b/optional-skills/research/darwinian-evolver/scripts/parrot_openrouter.py
@ -0,0 +1,218 @@
+"""
+parrot_openrouter: same as the upstream `parrot` example but the LLM call goes
+through OpenRouter (OpenAI SDK) instead of Anthropic native. Lets us run an
+end-to-end evolution with whatever model the user already has paid access to.
+
+Run with:
+    uv --project darwinian_evolver run python parrot_openrouter.py \
+        --num_iterations 3 --output_dir /tmp/parrot_out
+
+Reads `OPENROUTER_API_KEY` from the environment.
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+import jinja2
+from openai import OpenAI
+
+# Vendored problem types from upstream (AGPL — only run via subprocess in production)
+from darwinian_evolver.cli_common import build_hyperparameter_config_from_args
+from darwinian_evolver.cli_common import register_hyperparameter_args
+from darwinian_evolver.cli_common import parse_learning_log_view_type
+from darwinian_evolver.evolve_problem_loop import EvolveProblemLoop
+from darwinian_evolver.learning_log import LearningLogEntry
+from darwinian_evolver.problem import EvaluationFailureCase
+from darwinian_evolver.problem import EvaluationResult
+from darwinian_evolver.problem import Evaluator
+from darwinian_evolver.problem import Mutator
+from darwinian_evolver.problem import Organism
+from darwinian_evolver.problem import Problem
+
+DEFAULT_MODEL = os.environ.get("EVOLVER_MODEL", "openai/gpt-4o-mini")
+
+
+def _client() -> OpenAI:
+    key = os.environ.get("OPENROUTER_API_KEY")
+    if not key:
+        sys.exit("OPENROUTER_API_KEY is not set")
+    return OpenAI(api_key=key, base_url="https://openrouter.ai/api/v1")
+
+
+def _prompt_llm(prompt: str) -> str:
+    try:
+        r = _client().chat.completions.create(
+            model=DEFAULT_MODEL,
+            max_tokens=1024,
+            messages=[{"role": "user", "content": prompt}],
+        )
+        return r.choices[0].message.content or ""
+    except Exception as e:
+        # Treat any provider error (rate limit, content filter, schema reject)
+        # as a failed response. The evolver will simply see this as a low score
+        # on this organism and move on — much friendlier than killing the run.
+        return f"<LLM_ERROR: {type(e).__name__}: {e}>"
+
+
+class ParrotOrganism(Organism):
+    prompt_template: str
+
+    def run(self, phrase: str) -> str:
+        try:
+            prompt = jinja2.Template(self.prompt_template).render(phrase=phrase)
+        except jinja2.exceptions.TemplateError as e:
+            return f"Error rendering prompt: {e}"
+        if not prompt:
+            return ""
+        return _prompt_llm(prompt)
+
+
+class ParrotEvaluationFailureCase(EvaluationFailureCase):
+    phrase: str
+    response: str
+
+
+class ImproveParrotMutator(Mutator[ParrotOrganism, ParrotEvaluationFailureCase]):
+    IMPROVEMENT_PROMPT_TEMPLATE = """
+We want to build a prompt that causes an LLM to repeat back a given phrase verbatim.
+
+The current prompt template is:
+```
+{{ organism.prompt_template }}
+```
+
+Unfortunately, on this phrase:
+```
+{{ failure_case.phrase }}
+```
+the LLM responded with:
+```
+{{ failure_case.response }}
+```
+
+Diagnose what went wrong, then propose an improved prompt template. Put the new
+template in the LAST triple-backtick block of your response.
+""".strip()
+
+    def mutate(
+        self,
+        organism: ParrotOrganism,
+        failure_cases: list[ParrotEvaluationFailureCase],
+        learning_log_entries: list[LearningLogEntry],
+    ) -> list[ParrotOrganism]:
+        fc = failure_cases[0]
+        prompt = jinja2.Template(self.IMPROVEMENT_PROMPT_TEMPLATE).render(
+            organism=organism, failure_case=fc
+        )
+        try:
+            resp = _prompt_llm(prompt)
+            parts = resp.split("```")
+            if len(parts) < 3:
+                return []
+            new_tpl = parts[-2].strip()
+            return [ParrotOrganism(prompt_template=new_tpl)]
+        except Exception as e:
+            print(f"mutate error: {e}", file=sys.stderr)
+            return []
+
+
+class ParrotEvaluator(Evaluator[ParrotOrganism, EvaluationResult, ParrotEvaluationFailureCase]):
+    TRAINABLE_PHRASES = [
+        "Hello world.",
+        "bla",
+        "Bla",
+        "bla.",
+        '"bla bla".',
+        "Just say 'foo' once with no extra words.",
+    ]
+    HOLDOUT_PHRASES = [
+        "bla, but only once.",
+        "'bla'",
+    ]
+
+    def evaluate(self, organism: ParrotOrganism) -> EvaluationResult:
+        train_fails: list[ParrotEvaluationFailureCase] = []
+        hold_fails: list[ParrotEvaluationFailureCase] = []
+        for i, p in enumerate(self.TRAINABLE_PHRASES):
+            r = organism.run(p)
+            if r != p:
+                train_fails.append(ParrotEvaluationFailureCase(
+                    phrase=p, response=r, data_point_id=f"trainable_{i}"))
+        for i, p in enumerate(self.HOLDOUT_PHRASES):
+            r = organism.run(p)
+            if r != p:
+                hold_fails.append(ParrotEvaluationFailureCase(
+                    phrase=p, response=r, data_point_id=f"holdout_{i}"))
+        n_total = len(self.TRAINABLE_PHRASES) + len(self.HOLDOUT_PHRASES)
+        n_ok = n_total - len(train_fails) - len(hold_fails)
+        return EvaluationResult(
+            score=n_ok / n_total,
+            trainable_failure_cases=train_fails,
+            holdout_failure_cases=hold_fails,
+            # Always viable. Even a 0-score seed is a valid starting point; the
+            # mutator should still get a chance to fix it.
+            is_viable=True,
+        )
+
+
+def make_problem() -> Problem:
+    return Problem[ParrotOrganism, EvaluationResult, ParrotEvaluationFailureCase](
+        evaluator=ParrotEvaluator(),
+        mutators=[ImproveParrotMutator()],
+        initial_organism=ParrotOrganism(prompt_template="Say {{ phrase }}"),
+    )
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    register_hyperparameter_args(ap.add_argument_group("hyperparameters"))
+    ap.add_argument("--num_iterations", type=int, default=3)
+    ap.add_argument("--mutator_concurrency", type=int, default=4)
+    ap.add_argument("--evaluator_concurrency", type=int, default=4)
+    ap.add_argument("--output_dir", type=str, required=True)
+    args = ap.parse_args()
+
+    out = Path(args.output_dir)
+    out.mkdir(parents=True, exist_ok=True)
+
+    hp = build_hyperparameter_config_from_args(args)
+    loop = EvolveProblemLoop(
+        problem=make_problem(),
+        learning_log_view_type=parse_learning_log_view_type(hp.learning_log_view_type),
+        num_parents_per_iteration=hp.num_parents_per_iteration,
+        mutator_concurrency=args.mutator_concurrency,
+        evaluator_concurrency=args.evaluator_concurrency,
+        fixed_midpoint_score=hp.fixed_midpoint_score,
+        midpoint_score_percentile=hp.midpoint_score_percentile,
+        sharpness=hp.sharpness,
+        novelty_weight=hp.novelty_weight,
+        batch_size=hp.batch_size,
+        should_verify_mutations=hp.verify_mutations,
+    )
+
+    import json
+    log_path = out / "results.jsonl"
+    snap_dir = out / "snapshots"
+    snap_dir.mkdir(exist_ok=True)
+    print("Evaluating initial organism...")
+    for snap in loop.run(num_iterations=args.num_iterations):
+        (snap_dir / f"iteration_{snap.iteration}.pkl").write_bytes(snap.snapshot)
+        _, best_eval = snap.best_organism_result
+        print(f"iter={snap.iteration} pop={snap.population_size} "
+              f"best_score={best_eval.score:.3f}")
+        with log_path.open("a") as f:
+            f.write(json.dumps({
+                "iteration": snap.iteration,
+                "best_score": best_eval.score,
+                "pop_size": snap.population_size,
+                "score_percentiles": {str(k): v for k, v in snap.score_percentiles.items()},
+            }) + "\n")
+    print(f"\nDone. Results in: {out}")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/optional-skills/research/darwinian-evolver/scripts/show_snapshot.py
+++ b/optional-skills/research/darwinian-evolver/scripts/show_snapshot.py
@ -0,0 +1,69 @@
+"""
+show_snapshot.py — Dump the population from a darwinian-evolver snapshot pickle.
+
+Usage:
+    python show_snapshot.py PATH/TO/iteration_N.pkl [--field prompt_template]
+
+The script is intentionally Organism-agnostic: it walks `org.__dict__` and prints
+all str fields. By default it shows `prompt_template` if present; pass --field to
+target a different attribute (e.g. `regex_pattern`, `sql_query`, `code_block`).
+"""
+from __future__ import annotations
+
+import argparse
+import pickle
+import sys
+from pathlib import Path
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("snapshot", type=Path)
+    ap.add_argument(
+        "--field",
+        default=None,
+        help="Organism attribute to display. Defaults to the first str field found.",
+    )
+    ap.add_argument("--top", type=int, default=None, help="Show only top N by score.")
+    args = ap.parse_args()
+
+    if not args.snapshot.exists():
+        sys.exit(f"snapshot not found: {args.snapshot}")
+
+    # The outer pickle wraps a dict; the inner pickle contains the actual organism
+    # objects, which must be importable under their original dotted path. If you
+    # ran a custom driver, make sure its module is on sys.path before calling this.
+    outer = pickle.loads(args.snapshot.read_bytes())
+    if not isinstance(outer, dict) or "population_snapshot" not in outer:
+        sys.exit("not a darwinian-evolver snapshot (no population_snapshot key)")
+    inner = pickle.loads(outer["population_snapshot"])
+    pairs = inner["organisms"]  # list of (Organism, EvaluationResult)
+
+    print(f"# organisms: {len(pairs)}\n")
+    ranked = sorted(pairs, key=lambda p: getattr(p[1], "score", 0) or 0, reverse=True)
+    if args.top:
+        ranked = ranked[: args.top]
+
+    for i, (org, res) in enumerate(ranked):
+        score = getattr(res, "score", float("nan"))
+        print(f"=== rank {i} score={score:.3f} ===")
+        # pick field
+        field = args.field
+        if field is None:
+            for k, v in vars(org).items():
+                if isinstance(v, str) and not k.startswith("_") and k not in ("id",):
+                    field = k
+                    break
+        val = getattr(org, field, None) if field else None
+        if val is None:
+            print(f"  (no string field; org fields: {list(vars(org).keys())})")
+        else:
+            print(f"  {field} ({len(val)} chars):")
+            for ln in val.splitlines()[:30]:
+                print(f"    {ln}")
+        print()
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())