perf(doctor): parallelize API connectivity checks and disable IMDS (#22766)

`hermes doctor` ran every connectivity probe sequentially and, on a typical developer laptop, spent ~2s of its ~5s wall time inside boto3's EC2 instance-metadata-service lookup (169.254.169.254) — the default AWS credential chain probes IMDS even when AWS_BEARER_TOKEN_BEDROCK or AWS_ACCESS_KEY_ID is the only legitimate source. Refactor the API Connectivity section so every probe (OpenRouter, Anthropic, ~16 static API-key providers + dynamic profiles, AWS Bedrock) is a pure function returning a structured result, then fan them out through a ThreadPoolExecutor(max_workers=8). Output order, glyphs, colours, padding, and issue strings stay byte-for-byte identical to the sequential implementation; results are gathered in submission order. Also disable IMDS for the parallel block by setting AWS_EC2_METADATA_DISABLED=true on the parent thread before submitting work (and restoring its prior value in a finally block). Bedrock's real-API call gets a Config(connect_timeout=5, read_timeout=10, retries={max_attempts:1}) so a transient regional failure can't pad the run by 30+ seconds. Measured impact (5-run medians, 9950X3D): `hermes doctor` went from 5.07 s to 2.16 s (-57%). Doctor tests: 48 passed (test_doctor.py + test_doctor_command_install.py). The remaining ~2s of wall time is import overhead plus a couple of one-off network calls outside the API Connectivity section (`fetch_models_dev` provider catalog refresh, Nous OAuth refresh in `Auth Providers`). Those are next-tier targets, not part of this change.
2026-05-24 05:41:40 +00:00 · 2026-05-09 13:03:20 -07:00 · 2026-05-09 13:03:20 -07:00 · e612c3d6f0
commit e612c3d6f0
parent 8f711f79a4
1 changed files with 301 additions and 146 deletions
--- a/hermes_cli/doctor.py
+++ b/hermes_cli/doctor.py
@ -1166,44 +1166,92 @@ def run_doctor(args):
    # =========================================================================
    print()
    print(color("◆ API Connectivity", Colors.CYAN, Colors.BOLD))
-    
+
-    openrouter_key = os.getenv("OPENROUTER_API_KEY")
+    # Refactor: every connectivity probe below is HTTP-bound and fully
-    if openrouter_key:
+    # independent. Running them in series spent ~5s wall on a typical
-        print("  Checking OpenRouter API...", end="", flush=True)
+    # workstation (2s of that was boto3's IMDS lookup for AWS credentials,
    # which times out unless you're actually on EC2). Threading them with
    # a small executor pool collapses the section to roughly the slowest
    # single probe — about 2s — without changing the output format.
    #
    # Each ``_probe_*`` helper is a pure function: takes its inputs,
    # makes one HTTP/SDK call, returns a ``_ConnectivityResult`` carrying
    # the line(s) to print and any issue strings to append. No globals,
    # no shared mutable state, no printing inside the workers.
    import concurrent.futures as _futures
    from collections import namedtuple as _namedtuple
    _ConnectivityResult = _namedtuple(
        "_ConnectivityResult", ["label", "lines", "issues"]
    )
    _probes: list = []  # list of (label, callable) submitted in display order
    def _probe_openrouter() -> _ConnectivityResult:
        key = os.getenv("OPENROUTER_API_KEY")
        if not key:
            return _ConnectivityResult(
                "OpenRouter API",
                [(color("⚠", Colors.YELLOW), "OpenRouter API",
                  color("(not configured)", Colors.DIM))],
                [],
            )
        try:
            import httpx
-            response = httpx.get(
+            r = httpx.get(
                OPENROUTER_MODELS_URL,
-                headers={"Authorization": f"Bearer {openrouter_key}"},
+                headers={"Authorization": f"Bearer {key}"},
-                timeout=10
+                timeout=10,
            )
-            if response.status_code == 200:
+            if r.status_code == 200:
-                print(f"\r  {color('✓', Colors.GREEN)} OpenRouter API                          ")
+                return _ConnectivityResult(
-            elif response.status_code == 401:
+                    "OpenRouter API",
-                print(f"\r  {color('✗', Colors.RED)} OpenRouter API {color('(invalid API key)', Colors.DIM)}                ")
+                    [(color("✓", Colors.GREEN), "OpenRouter API", "")],
-                issues.append("Check OPENROUTER_API_KEY in .env")
+                    [],
            elif response.status_code == 402:
                print(f"\r  {color('✗', Colors.RED)} OpenRouter API {color('(out of credits — payment required)', Colors.DIM)}")
                issues.append(
                    "OpenRouter account has insufficient credits. "
                    "Fix: run 'hermes config set model.provider <provider>' to switch providers, "
                    "or fund your OpenRouter account at https://openrouter.ai/settings/credits"
                )
-            elif response.status_code == 429:
+            if r.status_code == 401:
-                print(f"\r  {color('✗', Colors.RED)} OpenRouter API {color('(rate limited)', Colors.DIM)}                ")
+                return _ConnectivityResult(
-                issues.append("OpenRouter rate limit hit — consider switching to a different provider or waiting")
+                    "OpenRouter API",
-            else:
+                    [(color("✗", Colors.RED), "OpenRouter API",
-                print(f"\r  {color('✗', Colors.RED)} OpenRouter API {color(f'(HTTP {response.status_code})', Colors.DIM)}                ")
+                      color("(invalid API key)", Colors.DIM))],
                    ["Check OPENROUTER_API_KEY in .env"],
                )
            if r.status_code == 402:
                return _ConnectivityResult(
                    "OpenRouter API",
                    [(color("✗", Colors.RED), "OpenRouter API",
                      color("(out of credits — payment required)", Colors.DIM))],
                    ["OpenRouter account has insufficient credits. "
                     "Fix: run 'hermes config set model.provider <provider>' "
                     "to switch providers, or fund your OpenRouter account "
                     "at https://openrouter.ai/settings/credits"],
                )
            if r.status_code == 429:
                return _ConnectivityResult(
                    "OpenRouter API",
                    [(color("✗", Colors.RED), "OpenRouter API",
                      color("(rate limited)", Colors.DIM))],
                    ["OpenRouter rate limit hit — consider switching to "
                     "a different provider or waiting"],
                )
            return _ConnectivityResult(
                "OpenRouter API",
                [(color("✗", Colors.RED), "OpenRouter API",
                  color(f"(HTTP {r.status_code})", Colors.DIM))],
                [],
            )
        except Exception as e:
-            print(f"\r  {color('✗', Colors.RED)} OpenRouter API {color(f'({e})', Colors.DIM)}                ")
+            return _ConnectivityResult(
-            issues.append("Check network connectivity")
+                "OpenRouter API",
-    else:
+                [(color("✗", Colors.RED), "OpenRouter API",
-        check_warn("OpenRouter API", "(not configured)")
+                  color(f"({e})", Colors.DIM))],
-    
+                ["Check network connectivity"],
-    from hermes_cli.auth import get_anthropic_key
+            )
-    anthropic_key = get_anthropic_key()
+
-    if anthropic_key:
+    def _probe_anthropic() -> _ConnectivityResult:
-        print("  Checking Anthropic API...", end="", flush=True)
+        from hermes_cli.auth import get_anthropic_key
        key = get_anthropic_key()
        if not key:
            return _ConnectivityResult("Anthropic API", [], [])
        try:
            import httpx
            from agent.anthropic_adapter import (
@ -1212,140 +1260,247 @@ def run_doctor(args):
                _OAUTH_ONLY_BETAS,
                _CONTEXT_1M_BETA,
            )
            headers = {"anthropic-version": "2023-06-01"}
-            is_oauth = _is_oauth_token(anthropic_key)
+            is_oauth = _is_oauth_token(key)
            if is_oauth:
-                headers["Authorization"] = f"Bearer {anthropic_key}"
+                headers["Authorization"] = f"Bearer {key}"
                headers["anthropic-beta"] = ",".join(_COMMON_BETAS + _OAUTH_ONLY_BETAS)
            else:
-                headers["x-api-key"] = anthropic_key
+                headers["x-api-key"] = key
-            response = httpx.get(
+            r = httpx.get(
                "https://api.anthropic.com/v1/models",
-                headers=headers,
+                headers=headers, timeout=10,
                timeout=10
            )
-            # Reactive recovery: OAuth subscriptions that don't include 1M
+            # Reactive recovery: OAuth subscriptions without 1M context reject the
-            # context reject the request with 400 "long context beta is not
+            # request with 400 "long context beta is not yet available for this
-            # yet available for this subscription". Retry once with that
+            # subscription". Retry once with that beta stripped so the doctor
-            # beta stripped so the doctor check doesn't falsely report the
+            # check doesn't falsely report Anthropic as unreachable.
            # Anthropic API as unreachable for those users.
            if (
                is_oauth
-                and response.status_code == 400
+                and r.status_code == 400
-                and "long context beta" in response.text.lower()
+                and "long context beta" in r.text.lower()
-                and "not yet available" in response.text.lower()
+                and "not yet available" in r.text.lower()
            ):
                headers["anthropic-beta"] = ",".join(
-                    [b for b in _COMMON_BETAS if b != _CONTEXT_1M_BETA] + list(_OAUTH_ONLY_BETAS)
+                    [b for b in _COMMON_BETAS if b != _CONTEXT_1M_BETA]
                    + list(_OAUTH_ONLY_BETAS)
                )
-                response = httpx.get(
+                r = httpx.get(
                    "https://api.anthropic.com/v1/models",
-                    headers=headers,
+                    headers=headers, timeout=10,
                    timeout=10,
                )
-            if response.status_code == 200:
+            if r.status_code == 200:
-                print(f"\r  {color('✓', Colors.GREEN)} Anthropic API                           ")
+                return _ConnectivityResult(
-            elif response.status_code == 401:
+                    "Anthropic API",
-                print(f"\r  {color('✗', Colors.RED)} Anthropic API {color('(invalid API key)', Colors.DIM)}                 ")
+                    [(color("✓", Colors.GREEN), "Anthropic API", "")],
-            else:
+                    [],
-                msg = "(couldn't verify)"
+                )
-                print(f"\r  {color('⚠', Colors.YELLOW)} Anthropic API {color(msg, Colors.DIM)}                 ")
+            if r.status_code == 401:
                return _ConnectivityResult(
                    "Anthropic API",
                    [(color("✗", Colors.RED), "Anthropic API",
                      color("(invalid API key)", Colors.DIM))],
                    [],
                )
            return _ConnectivityResult(
                "Anthropic API",
                [(color("⚠", Colors.YELLOW), "Anthropic API",
                  color("(couldn't verify)", Colors.DIM))],
                [],
            )
        except Exception as e:
-            print(f"\r  {color('⚠', Colors.YELLOW)} Anthropic API {color(f'({e})', Colors.DIM)}                 ")
+            return _ConnectivityResult(
                "Anthropic API",
                [(color("⚠", Colors.YELLOW), "Anthropic API",
                  color(f"({e})", Colors.DIM))],
                [],
            )
    def _probe_apikey_provider(pname, env_vars, default_url, base_env,
                               supports_health_check) -> _ConnectivityResult:
        """Check a single API-key provider and package the outcome for display.

        Args:
            pname: Provider display name; also compared against
                ``"Alibaba/DashScope"`` to decide on that provider's
                fallback request.
            env_vars: Environment variable names searched in order for a key.
                The first name is the one quoted in the invalid-key issue.
            default_url: URL requested when no base URL is in effect.
            base_env: Name of the env var holding a base-URL override, or a
                falsy value when there is none.
            supports_health_check: When false, no request is made and the
                provider is reported as "(key configured)".

        Returns:
            A ``_ConnectivityResult`` with no lines when no key is set;
            otherwise exactly one ``(glyph, label, detail)`` line, plus an
            issue string when the endpoint answers HTTP 401.
        """
        # The first non-empty value among the candidate variables wins.
        api_key = next(
            (val for val in (os.getenv(name, "") for name in env_vars) if val),
            "",
        )
        if not api_key:
            return _ConnectivityResult(pname, [], [])

        padded_name = pname.ljust(20)

        def _outcome(glyph, tint, note="", problems=()):
            # Build the single display row; without a note the detail
            # column is left as an empty string.
            mark = color(glyph, tint)
            detail = color(note, Colors.DIM) if note else ""
            return _ConnectivityResult(
                pname, [(mark, padded_name, detail)], list(problems)
            )

        if not supports_health_check:
            return _outcome("✓", Colors.GREEN, "(key configured)")

        try:
            import httpx

            endpoint_base = os.getenv(base_env, "") if base_env else ""
            # Kimi Code keys (sk-kimi-) default to api.kimi.com/coding/v1,
            # the OpenAI-compat surface that exposes /models.
            if not endpoint_base and api_key.startswith("sk-kimi-"):
                endpoint_base = "https://api.kimi.com/coding/v1"
            # Anthropic-compat bases (ending in /anthropic) don't support
            # /models, so switch to the OpenAI-compat base for the check.
            if endpoint_base and endpoint_base.rstrip("/").endswith("/anthropic"):
                from agent.auxiliary_client import _to_openai_base_url
                endpoint_base = _to_openai_base_url(endpoint_base)
            # A Kimi base that stops at /coding needs /v1 appended.
            if (
                base_url_host_matches(endpoint_base, "api.kimi.com")
                and endpoint_base.rstrip("/").endswith("/coding")
            ):
                endpoint_base = endpoint_base.rstrip("/") + "/v1"

            if endpoint_base:
                models_url = endpoint_base.rstrip("/") + "/models"
            else:
                models_url = default_url

            # Kimi hosts are sent a different User-Agent than everyone else.
            on_kimi = base_url_host_matches(endpoint_base, "api.kimi.com")
            request_headers = {
                "Authorization": f"Bearer {api_key}",
                "User-Agent": "claude-code/0.1.0" if on_kimi else _HERMES_USER_AGENT,
            }

            reply = httpx.get(models_url, headers=request_headers, timeout=10)
            # DashScope on its default URL gets one more try against the
            # aliyuncs.com endpoint after a 401.
            needs_dashscope_retry = (
                pname == "Alibaba/DashScope"
                and not endpoint_base
                and reply.status_code == 401
            )
            if needs_dashscope_retry:
                reply = httpx.get(
                    "https://dashscope.aliyuncs.com/compatible-mode/v1/models",
                    headers=request_headers,
                    timeout=10,
                )

            status = reply.status_code
            if status == 200:
                return _outcome("✓", Colors.GREEN)
            if status == 401:
                return _outcome(
                    "✗", Colors.RED, "(invalid API key)",
                    [f"Check {env_vars[0]} in .env"],
                )
            return _outcome("⚠", Colors.YELLOW, f"(HTTP {status})")
        except Exception as exc:
            # Any failure (import, network, helper) is shown as a warning
            # row and raises no issue.
            return _outcome("⚠", Colors.YELLOW, f"({exc})")
    def _probe_bedrock() -> _ConnectivityResult:
        """Check AWS Bedrock reachability and package the outcome for display.

        Returns:
            A ``_ConnectivityResult`` with no lines when
            ``agent.bedrock_adapter`` cannot be imported or
            ``has_aws_credentials()`` is false. Otherwise exactly one line:
            a success row showing the auth variable, region and model count,
            or a warning row (with an issue string) when boto3 is missing or
            the call raises.
        """
        provider = "AWS Bedrock"
        try:
            from agent.bedrock_adapter import (
                has_aws_credentials as _creds_present,
                resolve_aws_auth_env_var as _auth_source,
                resolve_bedrock_region as _pick_region,
            )
        except ImportError:
            return _ConnectivityResult(provider, [], [])
        if not _creds_present():
            return _ConnectivityResult(provider, [], [])

        auth_source = _auth_source()
        aws_region = _pick_region()
        padded = provider.ljust(20)

        def _warning(note, issue):
            # Yellow warning row paired with exactly one issue string.
            return _ConnectivityResult(
                provider,
                [(color("⚠", Colors.YELLOW), padded, color(note, Colors.DIM))],
                [issue],
            )

        try:
            import boto3
            from botocore.config import Config as _BotoConfig

            # Short timeouts and a trimmed retry budget on the real Bedrock
            # call so a transient failure doesn't pad the doctor run by
            # 30+ seconds.
            bedrock = boto3.client(
                "bedrock",
                region_name=aws_region,
                config=_BotoConfig(
                    connect_timeout=5,
                    read_timeout=10,
                    retries={"max_attempts": 1},
                ),
            )
            listing = bedrock.list_foundation_models()
            model_count = len(listing.get("modelSummaries", []))
            summary = f"({auth_source}, {aws_region}, {model_count} models)"
            return _ConnectivityResult(
                provider,
                [(color("✓", Colors.GREEN), padded, color(summary, Colors.DIM))],
                [],
            )
        except ImportError:
            pip_hint = f"{sys.executable} -m pip install boto3"
            return _warning(
                f"(boto3 not installed — {pip_hint})",
                f"Install boto3 for Bedrock: {pip_hint}",
            )
        except Exception as exc:
            failure = type(exc).__name__
            return _warning(
                f"({failure}: {exc})",
                f"AWS Bedrock: {failure} — check IAM permissions for "
                f"bedrock:ListFoundationModels",
            )
    # Build the probe submission list in display order
    _probes.append(("OpenRouter API", _probe_openrouter))
    _probes.append(("Anthropic API", _probe_anthropic))
    # -- API-key providers --
    # Tuple: (name, env_vars, default_url, base_env, supports_models_endpoint)
    # If supports_models_endpoint is False, we skip the health check and just show "configured"
    # Cached at module level after first build — profiles auto-extend it.
    global _APIKEY_PROVIDERS_CACHE
    if _APIKEY_PROVIDERS_CACHE is None:
        _APIKEY_PROVIDERS_CACHE = _build_apikey_providers_list()
-    _apikey_providers = _APIKEY_PROVIDERS_CACHE
+    for _entry in _APIKEY_PROVIDERS_CACHE:
-    for _pname, _env_vars, _default_url, _base_env, _supports_health_check in _apikey_providers:
+        _pname, _env_vars, _default_url, _base_env, _supports = _entry
-        _key = ""
+        # Capture loop vars by binding default args — without this, all closures
-        for _ev in _env_vars:
+        # would share the final iteration's values and every probe would hit
-            _key = os.getenv(_ev, "")
+        # the last provider's URL.
-            if _key:
+        _probes.append((_pname, lambda p=_pname, e=_env_vars, u=_default_url,
-                break
+                                       b=_base_env, s=_supports:
-        if _key:
+                                _probe_apikey_provider(p, e, u, b, s)))
            _label = _pname.ljust(20)
            # Some providers (like MiniMax) don't support /models endpoint
            if not _supports_health_check:
                print(f"  {color('✓', Colors.GREEN)} {_label} {color('(key configured)', Colors.DIM)}")
                continue
            print(f"  Checking {_pname} API...", end="", flush=True)
            try:
                import httpx
                _base = os.getenv(_base_env, "") if _base_env else ""
                # Auto-detect Kimi Code keys (sk-kimi-) → api.kimi.com/coding/v1
                # (OpenAI-compat surface, which exposes /models for health check).
                if not _base and _key.startswith("sk-kimi-"):
                    _base = "https://api.kimi.com/coding/v1"
                # Anthropic-compat endpoints (/anthropic, api.kimi.com/coding
                # with no /v1) don't support /models.  Rewrite to the OpenAI-compat
                # /v1 surface for health checks.
                if _base and _base.rstrip("/").endswith("/anthropic"):
                    from agent.auxiliary_client import _to_openai_base_url
                    _base = _to_openai_base_url(_base)
                if base_url_host_matches(_base, "api.kimi.com") and _base.rstrip("/").endswith("/coding"):
                    _base = _base.rstrip("/") + "/v1"
                _url = (_base.rstrip("/") + "/models") if _base else _default_url
                _headers = {
                    "Authorization": f"Bearer {_key}",
                    "User-Agent": _HERMES_USER_AGENT,
                }
                if base_url_host_matches(_base, "api.kimi.com"):
                    _headers["User-Agent"] = "claude-code/0.1.0"
                _resp = httpx.get(
                    _url,
                    headers=_headers,
                    timeout=10,
                )
                if (
                    _pname == "Alibaba/DashScope"
                    and not _base
                    and _resp.status_code == 401
                ):
                    _resp = httpx.get(
                        "https://dashscope.aliyuncs.com/compatible-mode/v1/models",
                        headers=_headers,
                        timeout=10,
                    )
                if _resp.status_code == 200:
                    print(f"\r  {color('✓', Colors.GREEN)} {_label}                          ")
                elif _resp.status_code == 401:
                    print(f"\r  {color('✗', Colors.RED)} {_label} {color('(invalid API key)', Colors.DIM)}           ")
                    issues.append(f"Check {_env_vars[0]} in .env")
                else:
                    print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'(HTTP {_resp.status_code})', Colors.DIM)}           ")
            except Exception as _e:
                print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'({_e})', Colors.DIM)}           ")
-    # -- AWS Bedrock --
+    _probes.append(("AWS Bedrock", _probe_bedrock))
-    # Bedrock uses the AWS SDK credential chain, not API keys.
+
    # Print a single status line so users see something happening, then
    # fan out. ``\r`` clears it once the first real result line lands.
    print(f"  {color(f'Running {len(_probes)} connectivity checks in parallel…', Colors.DIM)}",
          end="", flush=True)
    # Disable boto3's EC2 instance-metadata-service probe for the duration
    # of the parallel block. boto's default credential chain tries
    # 169.254.169.254 with a multi-second timeout when we're not on EC2,
    # which dominated the section's wall time before this fix
    # (~2s on a developer laptop, even with the rest parallelized).
    # Set on the parent thread before submitting work so the env-var
    # mutation never races with another worker. has_aws_credentials() in
    # the bedrock probe already gates on real env-var creds, so IMDS is
    # never the legitimate source for `hermes doctor`.
    _imds_prev = os.environ.get("AWS_EC2_METADATA_DISABLED")
    os.environ["AWS_EC2_METADATA_DISABLED"] = "true"
    try:
-        from agent.bedrock_adapter import has_aws_credentials, resolve_aws_auth_env_var, resolve_bedrock_region
+        # 8 workers is plenty — each probe is a single HTTP call plus a TLS
-        if has_aws_credentials():
+        # handshake. More than that wastes thread-startup cost and risks
-            _auth_var = resolve_aws_auth_env_var()
+        # noisy output if anything ever printed from inside a worker.
-            _region = resolve_bedrock_region()
+        with _futures.ThreadPoolExecutor(max_workers=8,
-            _label = "AWS Bedrock".ljust(20)
+                                         thread_name_prefix="doctor-probe") as _ex:
-            print(f"  Checking AWS Bedrock...", end="", flush=True)
+            _futures_in_order = [_ex.submit(_fn) for _, _fn in _probes]
-            try:
+            _results = [_f.result() for _f in _futures_in_order]
-                import boto3
+    finally:
-                _br_client = boto3.client("bedrock", region_name=_region)
+        if _imds_prev is None:
-                _br_resp = _br_client.list_foundation_models()
+            os.environ.pop("AWS_EC2_METADATA_DISABLED", None)
-                _model_count = len(_br_resp.get("modelSummaries", []))
+        else:
-                print(f"\r  {color('✓', Colors.GREEN)} {_label} {color(f'({_auth_var}, {_region}, {_model_count} models)', Colors.DIM)}           ")
+            os.environ["AWS_EC2_METADATA_DISABLED"] = _imds_prev
-            except ImportError:
+
-                print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'(boto3 not installed — {sys.executable} -m pip install boto3)', Colors.DIM)}           ")
+    # Clear the "Running …" line and print all results in submission order.
-                issues.append(f"Install boto3 for Bedrock: {sys.executable} -m pip install boto3")
+    print("\r" + " " * 70 + "\r", end="")
-            except Exception as _e:
+    for _r in _results:
-                _err_name = type(_e).__name__
+        for _glyph, _label, _detail in _r.lines:
-                print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'({_err_name}: {_e})', Colors.DIM)}           ")
+            if _detail:
-                issues.append(f"AWS Bedrock: {_err_name} — check IAM permissions for bedrock:ListFoundationModels")
+                print(f"  {_glyph} {_label} {_detail}")
-    except ImportError:
+            else:
-        pass  # bedrock_adapter not available — skip silently
+                print(f"  {_glyph} {_label}")
        for _issue in _r.issues:
            issues.append(_issue)
    # =========================================================================
    # Check: Submodules