hermes-agent/tests/plugins/image_gen/check_parity_vs_main.py

"""Behavior-parity check for the image-gen FAL plugin migration (#26241).

Spawns one subprocess per (version, scenario) cell — pinned to either
``origin/main`` (legacy in-tree FAL fall-through + ``configured == "fal"``
skip in ``_dispatch_to_plugin_provider``) or this PR's worktree (FAL is
itself a plugin and the dispatcher routes every set provider through
the registry). Each subprocess clears all FAL-related env vars + writes
a ``config.yaml``, then asks the dispatcher how it would route an
``image_generate`` call. The emitted shape is a JSON object with the
keys ``{dispatch_kind, provider_name, model, error_present}``:

* ``dispatch_kind`` ∈ ``{"legacy_fal", "plugin", "error",
  "unknown_payload", "exception"}`` — whether
  ``_dispatch_to_plugin_provider`` returned ``None`` (the call falls
  through silently to the in-tree pipeline), returned a plugin
  payload, returned an explicit provider-not-registered error,
  returned a payload that is not a JSON object, or raised.
* ``provider_name`` — when ``dispatch_kind == "plugin"``, the
  resolved provider name. ``None`` otherwise.
* ``model`` — the resolved FAL model id when applicable.
* ``error_present`` — whether an error message or exception text was
  captured while probing the dispatcher.

The parent process diffs the shapes per scenario. A diff means the
migration introduced an observable behaviour change vs origin/main —
likely a real regression for users on the existing config keys.

Run from the PR worktree:

    python tests/plugins/image_gen/check_parity_vs_main.py
"""
from __future__ import annotations

import json
import subprocess
import sys
from pathlib import Path


# Root of the checkout this script belongs to. The file lives under
# ``tests/plugins/image_gen/``, so the root is the fourth ancestor of the
# resolved file path (``parents[3]``).
REPO_ROOT = Path(__file__).resolve().parents[3]


# Pin one path to current main, one to the PR worktree.
# ``REPO_ROOT`` is ``.../.worktrees/<name>``; the main checkout lives
# two levels up. When running directly from a regular clone (no
# worktree), ``MAIN_DIR`` falls back to a sibling ``hermes-agent-main``
# checkout if one exists, and otherwise to ``REPO_ROOT`` itself (in
# which case both runs use the same code and the comparison is trivial).
def _resolve_main_dir() -> Path:
    """Pick the checkout that stands in for ``origin/main``.

    Candidates are tried in order and the first one containing
    ``tools/image_generation_tool.py`` wins:

    1. the directory two levels above ``REPO_ROOT`` (skipped if it is
       ``REPO_ROOT`` itself);
    2. a ``hermes-agent-main`` directory next to ``REPO_ROOT``.

    Returns:
        The matching checkout, or ``REPO_ROOT`` when neither candidate
        qualifies.
    """
    marker = Path("tools") / "image_generation_tool.py"
    two_levels_up = REPO_ROOT.parent.parent
    named_sibling = REPO_ROOT.parent / "hermes-agent-main"

    if two_levels_up != REPO_ROOT and (two_levels_up / marker).exists():
        return two_levels_up
    if (named_sibling / marker).exists():
        return named_sibling
    return REPO_ROOT


# Checkout used as the ``origin/main`` baseline; equals ``PR_DIR`` when no
# separate checkout could be located.
MAIN_DIR = _resolve_main_dir()
# Checkout under review (the one this script was loaded from).
PR_DIR = REPO_ROOT
# Fail at import time if the resolved root is missing the image-gen tool
# module, i.e. the script is not sitting inside a hermes-agent checkout.
assert (PR_DIR / "tools" / "image_generation_tool.py").exists(), (
    f"PR_DIR={PR_DIR} doesn't look like a hermes-agent checkout"
)


# Program run with ``python -c`` once per (version, scenario) cell.
#
# Arguments, as passed by ``_run_scenario``:
#   argv[1]  checkout directory to put first on ``sys.path``
#   argv[2]  JSON object of extra environment variables for the scenario
#   argv[3]  body of the ``config.yaml`` written into the scratch HERMES_HOME
#
# The program prints one JSON object (the "shape") as its final stdout line.
# The scratch HERMES_HOME is removed at interpreter exit so repeated runs do
# not accumulate directories in the system temp location.
SUBPROCESS_SCRIPT = r"""
import atexit, json, os, shutil, sys, tempfile
sys.path.insert(0, sys.argv[1])

# Isolated HERMES_HOME so the config write is hermetic. The cleanup is
# registered before any project import so it runs last at exit (atexit
# handlers run in reverse registration order).
home = tempfile.mkdtemp()
atexit.register(shutil.rmtree, home, ignore_errors=True)
os.environ["HERMES_HOME"] = home

# Clear FAL-related env so dispatch decisions are config-driven.
for k in (
    "FAL_KEY", "FAL_QUEUE_GATEWAY_URL",
    "TOOL_GATEWAY_DOMAIN", "TOOL_GATEWAY_USER_TOKEN",
    "FAL_IMAGE_MODEL",
):
    os.environ.pop(k, None)

scenario_env = json.loads(sys.argv[2])
os.environ.update(scenario_env)

config_yaml = sys.argv[3]
config_path = os.path.join(home, "config.yaml")
with open(config_path, "w") as f:
    f.write(config_yaml)

# Fresh import — must not have anything cached.
for name in list(sys.modules):
    if (name.startswith("tools.")
            or name.startswith("agent.")
            or name.startswith("plugins.")
            or name.startswith("hermes_cli.")):
        sys.modules.pop(name, None)

import tools.image_generation_tool as image_tool

dispatch_kind = None
provider_name = None
model = None
error_text = None

try:
    raw = image_tool._dispatch_to_plugin_provider("ping", "landscape")
    if raw is None:
        dispatch_kind = "legacy_fal"
    else:
        parsed = json.loads(raw) if isinstance(raw, str) else raw
        if isinstance(parsed, dict):
            if parsed.get("error_type") == "provider_not_registered":
                dispatch_kind = "error"
                error_text = parsed.get("error")
            else:
                dispatch_kind = "plugin"
                provider_name = parsed.get("provider")
                model = parsed.get("model")
        else:
            dispatch_kind = "unknown_payload"

    if model is None:
        # _resolve_fal_model still returns the active FAL model id even
        # when dispatch goes to a non-FAL plugin — used for the diff
        # only when applicable.
        try:
            model_id, _meta = image_tool._resolve_fal_model()
            if dispatch_kind == "legacy_fal":
                model = model_id
        except Exception:
            pass
except Exception as exc:
    dispatch_kind = "exception"
    error_text = repr(exc)

shape = {
    "dispatch_kind": dispatch_kind,
    "provider_name": provider_name,
    "model": model,
    "error_present": error_text is not None,
}
print(json.dumps(shape))
"""


# Parity matrix. ``main()`` runs every entry twice — once against
# ``MAIN_DIR`` and once against ``PR_DIR`` — and compares the two shapes.
# The env dict is applied after the subprocess has cleared the FAL-related
# variables, so only the keys listed here are set for those variables.
SCENARIOS: list[tuple[str, str, dict[str, str]]] = [
    # (label, config.yaml body, extra env vars)
    # Empty config and no extra environment.
    ("no-config-no-env", "", {}),
    # Provider explicitly set to "fal", no FAL_KEY.
    (
        "explicit-fal-no-creds",
        "image_gen:\n  provider: fal\n",
        {},
    ),
    # Provider explicitly set to "fal", FAL_KEY present.
    (
        "explicit-fal-with-creds",
        "image_gen:\n  provider: fal\n",
        {"FAL_KEY": "test-key"},
    ),
    # Same as above plus an explicit model id in the config.
    (
        "explicit-fal-with-model",
        "image_gen:\n  provider: fal\n  model: fal-ai/flux-2-pro\n",
        {"FAL_KEY": "test-key"},
    ),
    # Provider name that is presumably not registered anywhere — exercises
    # the unknown-provider path.
    (
        "explicit-typo-provider",
        "image_gen:\n  provider: not-a-real-backend\n",
        {"FAL_KEY": "test-key"},
    ),
    # Empty config; only the tool-gateway variables are set.
    (
        "managed-gateway-only",
        "",
        {
            "TOOL_GATEWAY_DOMAIN": "nousresearch.com",
            "TOOL_GATEWAY_USER_TOKEN": "nous-token",
        },
    ),
]


def _tail_text(captured, limit: int = 500) -> str:
    """Return the last ``limit`` characters of captured output as ``str``.

    ``subprocess.TimeoutExpired`` may carry ``None`` or raw bytes for its
    output attributes, so both are normalised here.
    """
    if captured is None:
        return ""
    if isinstance(captured, bytes):
        captured = captured.decode("utf-8", errors="replace")
    return captured[-limit:]


def _run_scenario(repo_path: Path, label: str, config_yaml: str, env: dict) -> dict:
    """Run ``SUBPROCESS_SCRIPT`` for one scenario against one checkout.

    Args:
        repo_path: Checkout the subprocess should import from.
        label: Scenario label. Not used by the body; kept so call sites
            pass the full scenario tuple.
        config_yaml: Body of the ``config.yaml`` the subprocess writes.
        env: Extra environment variables, forwarded as JSON.

    Returns:
        The shape dict parsed from the last stdout line, or a dict with an
        ``"error"`` key when the subprocess could not be launched, timed
        out, exited non-zero, or produced unparseable output. This function
        never raises for those conditions, so a single bad cell cannot
        abort the whole parity run.
    """
    # Interpreter preference: the checkout's own venv, then the main
    # checkout's venv, then whatever ``python3`` resolves to on PATH.
    venv_python = repo_path / ".venv" / "bin" / "python"
    if not venv_python.exists():
        venv_python = MAIN_DIR / ".venv" / "bin" / "python"
    if not venv_python.exists():
        venv_python = Path("python3")

    try:
        out = subprocess.run(
            [
                str(venv_python),
                "-c",
                SUBPROCESS_SCRIPT,
                str(repo_path),
                json.dumps(env),
                config_yaml,
            ],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except subprocess.TimeoutExpired as exc:
        # A hung dispatcher is reported like any other failed cell.
        return {
            "error": f"subprocess timed out after {exc.timeout}s",
            "stdout": _tail_text(exc.stdout),
            "stderr": _tail_text(exc.stderr),
        }
    except OSError as exc:
        # Interpreter missing or not executable.
        return {"error": f"could not launch {venv_python}: {exc}"}

    if out.returncode != 0:
        return {
            "error": "subprocess failed",
            "stdout": out.stdout[-500:],
            "stderr": out.stderr[-500:],
        }
    try:
        # Imported modules may print to stdout; the shape is the last line.
        return json.loads(out.stdout.strip().splitlines()[-1])
    except Exception as exc:
        return {"error": f"could not parse output: {exc}", "stdout": out.stdout}


def _reduce(shape: dict) -> dict:
    """Project a subprocess shape onto the fields compared for parity.

    Only ``dispatch_kind``, ``provider_name``, ``model`` and
    ``error_present`` are kept; a key absent from ``shape`` maps to
    ``None``.

    Note that the ``explicit-fal-*`` scenarios are expected to differ
    between the two checkouts: origin/main short-circuits to
    ``legacy_fal`` because of the ``configured == "fal"`` skip, while the
    PR routes through the plugin and reports ``dispatch_kind == "plugin"``
    with ``provider_name == "fal"``. The two are functionally equivalent
    (the plugin's ``generate()`` re-enters the same in-tree pipeline via
    ``_it`` indirection), but the difference is deliberately left in the
    reduced shape so reviewers can see and sign off on it.
    """
    compared_keys = ("dispatch_kind", "provider_name", "model", "error_present")
    return {key: shape.get(key) for key in compared_keys}


def _is_expected_fal_migration(main_reduced: dict, pr_reduced: dict) -> bool:
    """Return True for the one sanctioned diff: legacy FAL → FAL plugin.

    Only ``dispatch_kind`` and ``provider_name`` are inspected; ``model``
    and ``error_present`` may still differ, which is why callers print
    both shapes alongside the verdict.
    """
    return (
        main_reduced.get("dispatch_kind") == "legacy_fal"
        and pr_reduced.get("dispatch_kind") == "plugin"
        and pr_reduced.get("provider_name") == "fal"
    )


def main() -> int:
    """Run every scenario against both checkouts and report parity.

    Each scenario is classified as ``[OK]`` (identical reduced shapes),
    ``[DIFF]`` (the expected legacy_fal → plugin(fal) change), ``[FAIL]``
    (any other difference) or ``[ERR ]`` (a subprocess could not produce a
    shape).

    Returns:
        ``1`` if any scenario failed or errored, ``0`` otherwise.
    """
    print(f"main:    {MAIN_DIR}")
    print(f"pr:      {PR_DIR}")
    print()

    if MAIN_DIR == PR_DIR:
        print(
            "WARN: MAIN_DIR == PR_DIR — diffs will be trivially identical.\n"
            "      Set up a sibling 'hermes-agent-main' checkout pinned to "
            "origin/main to get real parity coverage."
        )
        print()

    failures: list[str] = []
    errors: list[str] = []
    intentional_diffs: list[tuple[str, dict, dict]] = []
    for label, config_yaml, env in SCENARIOS:
        main_shape = _run_scenario(MAIN_DIR, label, config_yaml, env)
        pr_shape = _run_scenario(PR_DIR, label, config_yaml, env)

        if "error" in main_shape or "error" in pr_shape:
            print(f"  [ERR ] {label}: subprocess failed")
            print(f"    main: {main_shape}")
            print(f"    pr:   {pr_shape}")
            errors.append(label)
            continue

        main_reduced = _reduce(main_shape)
        pr_reduced = _reduce(pr_shape)

        if main_reduced == pr_reduced:
            print(f"  [OK]   {label}: {main_reduced}")
            continue

        # On main, "explicit-fal-*" returns legacy_fal; on PR, plugin
        # dispatch. That's the only acceptable diff — flag everything
        # else as a regression.
        if _is_expected_fal_migration(main_reduced, pr_reduced):
            print(f"  [DIFF] {label}: legacy_fal → plugin (fal) — expected")
            # Show both shapes: the whitelist does not look at ``model`` or
            # ``error_present``, so reviewers need them to sign off.
            print(f"    main: {main_reduced}")
            print(f"    pr:   {pr_reduced}")
            intentional_diffs.append((label, main_reduced, pr_reduced))
        else:
            print(f"  [FAIL] {label}")
            print(f"    main: {main_reduced}")
            print(f"    pr:   {pr_reduced}")
            failures.append(label)

    print()
    if errors:
        print(f"SUBPROCESS ERRORS in {len(errors)} scenario(s):")
        for name in errors:
            print(f"  - {name}")
    if failures:
        print(f"BEHAVIOUR REGRESSION in {len(failures)} scenario(s):")
        for name in failures:
            print(f"  - {name}")
    if intentional_diffs:
        print(
            f"INTENTIONAL DIFFS ({len(intentional_diffs)}): "
            f"legacy_fal → plugin dispatch for explicit FAL paths."
        )
    if failures or errors:
        return 1
    print(f"PARITY OK across {len(SCENARIOS)} scenarios.")
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())