feat(skill): darwinian-evolver optional skill

Thin wrapper around Imbue's darwinian_evolver (AGPL-3.0, subprocess-only). Ships a working OpenRouter driver (parrot_openrouter.py), a snapshot inspector (show_snapshot.py), and a custom-problem template. SKILL.md has a 58-char description and Pitfalls sourced from actually running the loop: non-viable seed trap, Azure content filter killing runs, loop.run() being a generator, nested-pickle snapshots, and aggressive default concurrency. Salvaged from #12719 by @Bihruze — the original PR shipped 12,289 LOC across 61 files (29 Python modules, FastAPI dashboard, VS Code extension, benchmark hub, marketplace, etc.), which was far beyond the scope of the underlying issue (#336). This version stays at the ~700-LOC scope that the issue actually asked for. Authorship of the original effort is credited via an AUTHOR_MAP entry and the SKILL.md author field. Verified end-to-end: seed 'Say {{ phrase }}' (score 0.000) evolved into 'Please repeat the following phrase exactly as it is, without any modifications or additional formatting: {{ phrase }}' (score 0.750) across 3 iterations on gpt-4o-mini via OpenRouter. Co-authored-by: Bihruze <98262967+Bihruze@users.noreply.github.com>
2026-05-25 05:52:34 +00:00 · 2026-05-15 21:54:56 -07:00 · 2026-05-15 21:54:56 -07:00 · c9b32a654c
commit c9b32a654c
parent e377833fa6
5 changed files with 828 additions and 0 deletions
--- a/optional-skills/research/darwinian-evolver/templates/custom_problem_template.py
+++ b/optional-skills/research/darwinian-evolver/templates/custom_problem_template.py
@ -0,0 +1,240 @@
+"""
+Template: a custom darwinian-evolver problem.
+
+Copy this file, fill in the THREE marked spots (Organism, Evaluator, Mutator)
+and the starting artifact in `make_problem()`, then run it as a driver script.
+The skeleton handles all the wiring so you only write the domain-specific logic.
+
+To run:
+    cd ~/.hermes/cache/darwinian-evolver/darwinian_evolver
+    OPENROUTER_API_KEY=... uv run --with openai python /path/to/this_file.py \
+        --num_iterations 3 --num_parents_per_iteration 2 \
+        --output_dir /tmp/my_problem
+
+The pattern mirrors `scripts/parrot_openrouter.py` (the working reference).
+"""
+from __future__ import annotations
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+from openai import OpenAI
+
+# Upstream types (AGPL — invoked via subprocess in production; importing here
+# is fine for skill-side driver scripts the user owns).
+from darwinian_evolver.cli_common import (
+    build_hyperparameter_config_from_args,
+    parse_learning_log_view_type,
+    register_hyperparameter_args,
+)
+from darwinian_evolver.evolve_problem_loop import EvolveProblemLoop
+from darwinian_evolver.learning_log import LearningLogEntry
+from darwinian_evolver.problem import (
+    EvaluationFailureCase,
+    EvaluationResult,
+    Evaluator,
+    Mutator,
+    Organism,
+    Problem,
+)
+
+# Model slug sent with every chat-completion request; override it by setting
+# the EVOLVER_MODEL environment variable before launching the script.
+DEFAULT_MODEL = os.getenv("EVOLVER_MODEL", "openai/gpt-4o-mini")
+
+
+def _client() -> OpenAI:
+    """Build an OpenAI-compatible client pointed at the OpenRouter endpoint.
+
+    The API key is read from the ``OPENROUTER_API_KEY`` environment variable.
+    When the variable is unset or empty the process is terminated through
+    ``sys.exit`` with an explanatory message instead of returning.
+    """
+    api_key = os.environ.get("OPENROUTER_API_KEY")
+    if api_key:
+        return OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")
+    sys.exit("OPENROUTER_API_KEY is not set")
+
+
+def _prompt_llm(prompt: str, max_tokens: int = 1024) -> str:
+    """Send *prompt* as a single user message and return the model's reply text.
+
+    Args:
+        prompt: Full text of the user message.
+        max_tokens: Completion-length cap forwarded to the API.
+
+    Returns:
+        The content of the first choice, or ``""`` when that content is empty
+        or ``None``. If the request (or reading the response) raises any
+        ``Exception``, a ``"<LLM_ERROR: ...>"`` marker string describing the
+        error is returned instead of propagating it.
+    """
+    messages = [{"role": "user", "content": prompt}]
+    try:
+        completion = _client().chat.completions.create(
+            model=DEFAULT_MODEL,
+            max_tokens=max_tokens,
+            messages=messages,
+        )
+        reply = completion.choices[0].message.content
+    except Exception as exc:
+        # Never let one bad LLM response kill the run. Note that the
+        # SystemExit raised by _client() for a missing key is not an
+        # Exception subclass, so it still terminates the process.
+        return f"<LLM_ERROR: {type(exc).__name__}: {exc}>"
+    return reply if reply else ""
+
+
+# ---------------------------------------------------------------------------
+# 1. ORGANISM — what you are evolving.
+# ---------------------------------------------------------------------------
+class MyOrganism(Organism):
+    """Template organism: carries the artifact being evolved and knows how to run it."""
+
+    # TODO: replace with your artifact field. Common shapes:
+    #   prompt_template: str
+    #   regex_pattern: str
+    #   sql_query: str
+    #   code_block: str
+    artifact: str
+
+    def run(self, *inputs) -> str:
+        """Exercise the organism on a test input.
+
+        Return whatever your evaluator wants to score. The template's
+        ``MyEvaluator`` calls ``organism.run(inp)`` with a single input and
+        compares the result to the expected value with ``!=``.
+
+        Raises:
+            NotImplementedError: Always, until this stub is replaced.
+        """
+        # TODO: implement. For prompt evolution this typically calls _prompt_llm
+        # with the artifact rendered against the input. For regex/SQL it would
+        # call `re.findall(self.artifact, input)` / execute SQL / etc.
+        raise NotImplementedError(
+            f"{type(self).__name__}.run() is a template stub; "
+            "implement it before starting the evolver"
+        )
+
+
+# ---------------------------------------------------------------------------
+# 2. EVALUATOR — score organisms and surface failures the mutator can learn from.
+# ---------------------------------------------------------------------------
+class MyFailureCase(EvaluationFailureCase):
+    """One data point the organism got wrong, as shown to the mutator's LLM prompt."""
+
+    # TODO: include enough context for the LLM to diagnose the failure.
+    # The evaluator in this file also passes `data_point_id=` when building a
+    # failure case; that field is not declared here, so it presumably comes
+    # from EvaluationFailureCase — confirm against upstream.
+    input: str  # the test input handed to organism.run()
+    expected: str  # the value the evaluator wanted
+    actual: str  # what organism.run() actually returned
+
+
+class MyEvaluator(Evaluator[MyOrganism, EvaluationResult, MyFailureCase]):
+    """Exact-match evaluator: runs the organism on every data point and scores the pass rate."""
+
+    # Split your dataset. Mutator only sees trainable; holdout detects overfitting.
+    TRAINABLE = [
+        # TODO: list of (input, expected) tuples
+        # ("input1", "expected1"),
+    ]
+    HOLDOUT = [
+        # TODO: separate set the mutator never sees
+    ]
+
+    def _failures_in(self, organism: MyOrganism, dataset, split: str) -> list[MyFailureCase]:
+        """Run *organism* over *dataset* in order and collect the mismatches.
+
+        Each failure is tagged ``"<split>_<index>"`` so it can be traced back
+        to its position in the originating list.
+        """
+        mismatches: list[MyFailureCase] = []
+        for idx, (inp, expected) in enumerate(dataset):
+            actual = organism.run(inp)
+            if actual != expected:
+                mismatches.append(
+                    MyFailureCase(
+                        input=inp,
+                        expected=expected,
+                        actual=actual,
+                        data_point_id=f"{split}_{idx}",
+                    )
+                )
+        return mismatches
+
+    def evaluate(self, organism: MyOrganism) -> EvaluationResult:
+        """Score *organism* as the fraction of all data points it reproduces exactly.
+
+        TRAINABLE is processed first, then HOLDOUT. The score is
+        ``passed / total`` over both splits combined, or ``0.0`` when both
+        lists are empty. Failures are reported separately per split.
+        """
+        trainable_failures = self._failures_in(organism, self.TRAINABLE, "trainable")
+        holdout_failures = self._failures_in(organism, self.HOLDOUT, "holdout")
+
+        total = len(self.TRAINABLE) + len(self.HOLDOUT)
+        passed = total - len(trainable_failures) - len(holdout_failures)
+        if total:
+            score = passed / total
+        else:
+            score = 0.0
+
+        return EvaluationResult(
+            score=score,
+            trainable_failure_cases=trainable_failures,
+            holdout_failure_cases=holdout_failures,
+            # Always-viable. The evolver only blocks completely-broken organisms;
+            # a 0-score organism is fine and will simply be sampled less often.
+            is_viable=True,
+        )
+
+
+# ---------------------------------------------------------------------------
+# 3. MUTATOR — LLM proposes an improved organism from a failure case.
+# ---------------------------------------------------------------------------
+class MyMutator(Mutator[MyOrganism, MyFailureCase]):
+    """LLM-backed mutator: shows the model one failure and asks for a better artifact."""
+
+    PROMPT = """
+The current artifact is:
+```
+{artifact}
+```
+
+On this input:
+```
+{input}
+```
+it produced:
+```
+{actual}
+```
+but we wanted:
+```
+{expected}
+```
+
+Diagnose what went wrong, then propose an improved version of the artifact.
+Put the new version in the LAST triple-backtick block of your response.
+""".strip()
+
+    @staticmethod
+    def _last_fenced_block(text: str) -> str | None:
+        """Return the stripped contents of the last complete ``` block in *text*.
+
+        Returns ``None`` when *text* has no complete fenced block or when the
+        fences are unbalanced (for example a reply cut off mid-block).
+        """
+        parts = text.split("```")
+        # N fences cut the text into N + 1 parts, so balanced fences always
+        # give an odd part count. With an even count the final fence was never
+        # closed and parts[-2] would be the prose *between* blocks, not code.
+        if len(parts) < 3 or len(parts) % 2 == 0:
+            return None
+        raw = parts[-2]
+        # A language tag ("python", "sql", "c++") sits on the opening fence
+        # line, i.e. before the first newline of the raw block. Inspect the
+        # raw text rather than the stripped text so a short first line of real
+        # content (which follows a newline) is never mistaken for a tag.
+        fence_line, newline, remainder = raw.partition("\n")
+        tag = fence_line.strip()
+        is_language_tag = (
+            bool(newline)
+            and 0 < len(tag) < 20
+            and all(ch.isalnum() or ch in "_+#.-" for ch in tag)
+        )
+        return (remainder if is_language_tag else raw).strip()
+
+    def mutate(
+        self,
+        organism: MyOrganism,
+        failure_cases: list[MyFailureCase],
+        learning_log_entries: list[LearningLogEntry],
+    ) -> list[MyOrganism]:
+        """Ask the LLM for an improved artifact based on the first failure case.
+
+        Args:
+            organism: Parent whose ``artifact`` is shown to the model.
+            failure_cases: Failures of the parent; only the first one is used.
+            learning_log_entries: Accepted for interface compatibility; this
+                template does not use them.
+
+        Returns:
+            A single-element list with the proposed organism, or ``[]`` when
+            there is no failure to learn from or the reply contains no usable
+            fenced block (this includes ``<LLM_ERROR ...>`` replies).
+        """
+        if not failure_cases:
+            # Nothing to show the model; avoid an IndexError on [0].
+            return []
+        fc = failure_cases[0]
+        prompt = self.PROMPT.format(
+            artifact=organism.artifact,
+            input=fc.input,
+            actual=fc.actual,
+            expected=fc.expected,
+        )
+        new_artifact = self._last_fenced_block(_prompt_llm(prompt))
+        if not new_artifact:
+            # No block, unbalanced fences, or an empty block: propose nothing
+            # rather than an organism with an empty artifact.
+            return []
+        return [MyOrganism(artifact=new_artifact)]
+
+
+# ---------------------------------------------------------------------------
+# Driver — fills in the EvolveProblemLoop boilerplate. Apart from the starting
+# artifact in make_problem(), you shouldn't need to touch anything below this
+# line for a typical run.
+# ---------------------------------------------------------------------------
+def make_problem() -> Problem:
+    """Assemble the Problem: seed organism, evaluator, and the single LLM mutator."""
+    problem_type = Problem[MyOrganism, EvaluationResult, MyFailureCase]
+    return problem_type(
+        initial_organism=MyOrganism(artifact="TODO: starting artifact here"),  # TODO
+        evaluator=MyEvaluator(),
+        mutators=[MyMutator()],
+    )
+
+
+def main() -> int:
+    """Parse CLI flags, run the evolution loop, and save one snapshot per yielded step.
+
+    Snapshots are written to ``<output_dir>/snapshots/iteration_<n>.pkl`` and a
+    one-line progress summary is printed for each. Returns ``0`` on completion.
+    """
+    parser = argparse.ArgumentParser()
+    register_hyperparameter_args(parser.add_argument_group("hyperparameters"))
+    parser.add_argument("--num_iterations", type=int, default=3)
+    parser.add_argument("--mutator_concurrency", type=int, default=2)
+    parser.add_argument("--evaluator_concurrency", type=int, default=2)
+    parser.add_argument("--output_dir", type=str, required=True)
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir)
+    snapshot_dir = output_dir / "snapshots"
+    output_dir.mkdir(parents=True, exist_ok=True)
+    snapshot_dir.mkdir(exist_ok=True)
+
+    hyper = build_hyperparameter_config_from_args(args)
+    view_type = parse_learning_log_view_type(hyper.learning_log_view_type)
+    loop = EvolveProblemLoop(
+        problem=make_problem(),
+        learning_log_view_type=view_type,
+        num_parents_per_iteration=hyper.num_parents_per_iteration,
+        mutator_concurrency=args.mutator_concurrency,
+        evaluator_concurrency=args.evaluator_concurrency,
+        fixed_midpoint_score=hyper.fixed_midpoint_score,
+        midpoint_score_percentile=hyper.midpoint_score_percentile,
+        sharpness=hyper.sharpness,
+        novelty_weight=hyper.novelty_weight,
+        batch_size=hyper.batch_size,
+        should_verify_mutations=hyper.verify_mutations,
+    )
+
+    print("Evaluating initial organism...")
+    # loop.run() is consumed lazily: each item it yields is persisted and
+    # reported before the next one is requested.
+    for snap in loop.run(num_iterations=args.num_iterations):
+        snapshot_path = snapshot_dir / f"iteration_{snap.iteration}.pkl"
+        snapshot_path.write_bytes(snap.snapshot)
+        _organism, best = snap.best_organism_result
+        print(f"iter={snap.iteration} pop={snap.population_size} best_score={best.score:.3f}")
+
+    print(f"\nDone. Results in: {output_dir}")
+    return 0
+
+
+# Script entry point: propagate main()'s return value as the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())