perf(doctor): parallelize API connectivity checks and disable IMDS (#22766)

`hermes doctor` ran every connectivity probe sequentially and on a typical developer laptop spent ~2s of its ~5s wall time inside boto3's EC2 instance-metadata-service lookup (169.254.169.254) — the default AWS credential chain probes IMDS even when AWS_BEARER_TOKEN_BEDROCK or AWS_ACCESS_KEY_ID is the only legitimate source. Refactor the API Connectivity section so every probe (OpenRouter, Anthropic, ~16 static API-key providers + dynamic profiles, AWS Bedrock) is a pure function returning a structured result, then fan them out through a ThreadPoolExecutor(max_workers=8). Output order, glyphs, colours, padding, and issue strings stay byte-for-byte identical to the sequential implementation; results are gathered in submission order. Also disable IMDS for the parallel block by setting AWS_EC2_METADATA_DISABLED=true on the parent thread before submitting work (and restoring its prior value in a finally block). Bedrock's real-API call gets a Config(connect_timeout=5, read_timeout=10, retries={max_attempts:1}) so a transient regional failure can't pad the run by 30+ seconds. Measured impact (5-run medians, 9950X3D): hermes doctor: 5.07 → 2.16 s (-57%) Doctor tests: 48 passed (test_doctor.py + test_doctor_command_install.py). The remaining ~2s of wall is import overhead + a couple of one-off network calls outside the API Connectivity section (`fetch_models_dev` provider catalog refresh, Nous OAuth refresh in `Auth Providers`). Those are next-tier targets, not part of this change.
2026-05-18 04:41:56 +00:00 · 2026-05-09 13:03:20 -07:00 · 2026-05-09 13:03:20 -07:00 · e612c3d6f0
commit e612c3d6f0
parent 8f711f79a4
1 changed files with 301 additions and 146 deletions
--- a/hermes_cli/doctor.py
+++ b/hermes_cli/doctor.py
@ -1166,44 +1166,92 @@ def run_doctor(args):
    # =========================================================================
    print()
    print(color("◆ API Connectivity", Colors.CYAN, Colors.BOLD))
-    
-    openrouter_key = os.getenv("OPENROUTER_API_KEY")
-    if openrouter_key:
-        print("  Checking OpenRouter API...", end="", flush=True)
+
+    # Refactor: every connectivity probe below is HTTP-bound and fully
+    # independent. Running them in series spent ~5s wall on a typical
+    # workstation (2s of that was boto3's IMDS lookup for AWS credentials,
+    # which times out unless you're actually on EC2). Threading them with
+    # a small executor pool collapses the section to roughly the slowest
+    # single probe — about 2s — without changing the output format.
+    #
+    # Each ``_probe_*`` helper is a pure function: takes its inputs,
+    # makes one HTTP/SDK call, returns a ``_ConnectivityResult`` carrying
+    # the line(s) to print and any issue strings to append. No globals,
+    # no shared mutable state, no printing inside the workers.
+    import concurrent.futures as _futures
+    from collections import namedtuple as _namedtuple
+
+    _ConnectivityResult = _namedtuple(
+        "_ConnectivityResult", ["label", "lines", "issues"]
+    )
+    _probes: list = []  # list of (label, callable) submitted in display order
+
+    def _probe_openrouter() -> _ConnectivityResult:
+        key = os.getenv("OPENROUTER_API_KEY")
+        if not key:
+            return _ConnectivityResult(
+                "OpenRouter API",
+                [(color("⚠", Colors.YELLOW), "OpenRouter API",
+                  color("(not configured)", Colors.DIM))],
+                [],
+            )
        try:
            import httpx
-            response = httpx.get(
+            r = httpx.get(
                OPENROUTER_MODELS_URL,
-                headers={"Authorization": f"Bearer {openrouter_key}"},
-                timeout=10
+                headers={"Authorization": f"Bearer {key}"},
+                timeout=10,
            )
-            if response.status_code == 200:
-                print(f"\r  {color('✓', Colors.GREEN)} OpenRouter API                          ")
-            elif response.status_code == 401:
-                print(f"\r  {color('✗', Colors.RED)} OpenRouter API {color('(invalid API key)', Colors.DIM)}                ")
-                issues.append("Check OPENROUTER_API_KEY in .env")
-            elif response.status_code == 402:
-                print(f"\r  {color('✗', Colors.RED)} OpenRouter API {color('(out of credits — payment required)', Colors.DIM)}")
-                issues.append(
-                    "OpenRouter account has insufficient credits. "
-                    "Fix: run 'hermes config set model.provider <provider>' to switch providers, "
-                    "or fund your OpenRouter account at https://openrouter.ai/settings/credits"
+            if r.status_code == 200:
+                return _ConnectivityResult(
+                    "OpenRouter API",
+                    [(color("✓", Colors.GREEN), "OpenRouter API", "")],
+                    [],
                )
-            elif response.status_code == 429:
-                print(f"\r  {color('✗', Colors.RED)} OpenRouter API {color('(rate limited)', Colors.DIM)}                ")
-                issues.append("OpenRouter rate limit hit — consider switching to a different provider or waiting")
-            else:
-                print(f"\r  {color('✗', Colors.RED)} OpenRouter API {color(f'(HTTP {response.status_code})', Colors.DIM)}                ")
+            if r.status_code == 401:
+                return _ConnectivityResult(
+                    "OpenRouter API",
+                    [(color("✗", Colors.RED), "OpenRouter API",
+                      color("(invalid API key)", Colors.DIM))],
+                    ["Check OPENROUTER_API_KEY in .env"],
+                )
+            if r.status_code == 402:
+                return _ConnectivityResult(
+                    "OpenRouter API",
+                    [(color("✗", Colors.RED), "OpenRouter API",
+                      color("(out of credits — payment required)", Colors.DIM))],
+                    ["OpenRouter account has insufficient credits. "
+                     "Fix: run 'hermes config set model.provider <provider>' "
+                     "to switch providers, or fund your OpenRouter account "
+                     "at https://openrouter.ai/settings/credits"],
+                )
+            if r.status_code == 429:
+                return _ConnectivityResult(
+                    "OpenRouter API",
+                    [(color("✗", Colors.RED), "OpenRouter API",
+                      color("(rate limited)", Colors.DIM))],
+                    ["OpenRouter rate limit hit — consider switching to "
+                     "a different provider or waiting"],
+                )
+            return _ConnectivityResult(
+                "OpenRouter API",
+                [(color("✗", Colors.RED), "OpenRouter API",
+                  color(f"(HTTP {r.status_code})", Colors.DIM))],
+                [],
+            )
        except Exception as e:
-            print(f"\r  {color('✗', Colors.RED)} OpenRouter API {color(f'({e})', Colors.DIM)}                ")
-            issues.append("Check network connectivity")
-    else:
-        check_warn("OpenRouter API", "(not configured)")
-    
-    from hermes_cli.auth import get_anthropic_key
-    anthropic_key = get_anthropic_key()
-    if anthropic_key:
-        print("  Checking Anthropic API...", end="", flush=True)
+            return _ConnectivityResult(
+                "OpenRouter API",
+                [(color("✗", Colors.RED), "OpenRouter API",
+                  color(f"({e})", Colors.DIM))],
+                ["Check network connectivity"],
+            )
+
+    def _probe_anthropic() -> _ConnectivityResult:
+        from hermes_cli.auth import get_anthropic_key
+        key = get_anthropic_key()
+        if not key:
+            return _ConnectivityResult("Anthropic API", [], [])
        try:
            import httpx
            from agent.anthropic_adapter import (
@ -1212,140 +1260,247 @@ def run_doctor(args):
                _OAUTH_ONLY_BETAS,
                _CONTEXT_1M_BETA,
            )
-
            headers = {"anthropic-version": "2023-06-01"}
-            is_oauth = _is_oauth_token(anthropic_key)
+            is_oauth = _is_oauth_token(key)
            if is_oauth:
-                headers["Authorization"] = f"Bearer {anthropic_key}"
+                headers["Authorization"] = f"Bearer {key}"
                headers["anthropic-beta"] = ",".join(_COMMON_BETAS + _OAUTH_ONLY_BETAS)
            else:
-                headers["x-api-key"] = anthropic_key
-            response = httpx.get(
+                headers["x-api-key"] = key
+            r = httpx.get(
                "https://api.anthropic.com/v1/models",
-                headers=headers,
-                timeout=10
+                headers=headers, timeout=10,
            )
-            # Reactive recovery: OAuth subscriptions that don't include 1M
-            # context reject the request with 400 "long context beta is not
-            # yet available for this subscription". Retry once with that
-            # beta stripped so the doctor check doesn't falsely report the
-            # Anthropic API as unreachable for those users.
+            # Reactive recovery: OAuth subscriptions without 1M context reject the
+            # request with 400 "long context beta is not yet available for this
+            # subscription". Retry once with that beta stripped so the doctor
+            # check doesn't falsely report Anthropic as unreachable.
            if (
                is_oauth
-                and response.status_code == 400
-                and "long context beta" in response.text.lower()
-                and "not yet available" in response.text.lower()
+                and r.status_code == 400
+                and "long context beta" in r.text.lower()
+                and "not yet available" in r.text.lower()
            ):
                headers["anthropic-beta"] = ",".join(
-                    [b for b in _COMMON_BETAS if b != _CONTEXT_1M_BETA] + list(_OAUTH_ONLY_BETAS)
+                    [b for b in _COMMON_BETAS if b != _CONTEXT_1M_BETA]
+                    + list(_OAUTH_ONLY_BETAS)
                )
-                response = httpx.get(
+                r = httpx.get(
                    "https://api.anthropic.com/v1/models",
-                    headers=headers,
-                    timeout=10,
+                    headers=headers, timeout=10,
                )
-            if response.status_code == 200:
-                print(f"\r  {color('✓', Colors.GREEN)} Anthropic API                           ")
-            elif response.status_code == 401:
-                print(f"\r  {color('✗', Colors.RED)} Anthropic API {color('(invalid API key)', Colors.DIM)}                 ")
-            else:
-                msg = "(couldn't verify)"
-                print(f"\r  {color('⚠', Colors.YELLOW)} Anthropic API {color(msg, Colors.DIM)}                 ")
+            if r.status_code == 200:
+                return _ConnectivityResult(
+                    "Anthropic API",
+                    [(color("✓", Colors.GREEN), "Anthropic API", "")],
+                    [],
+                )
+            if r.status_code == 401:
+                return _ConnectivityResult(
+                    "Anthropic API",
+                    [(color("✗", Colors.RED), "Anthropic API",
+                      color("(invalid API key)", Colors.DIM))],
+                    [],
+                )
+            return _ConnectivityResult(
+                "Anthropic API",
+                [(color("⚠", Colors.YELLOW), "Anthropic API",
+                  color("(couldn't verify)", Colors.DIM))],
+                [],
+            )
        except Exception as e:
-            print(f"\r  {color('⚠', Colors.YELLOW)} Anthropic API {color(f'({e})', Colors.DIM)}                 ")
+            return _ConnectivityResult(
+                "Anthropic API",
+                [(color("⚠", Colors.YELLOW), "Anthropic API",
+                  color(f"({e})", Colors.DIM))],
+                [],
+            )
+
+    def _probe_apikey_provider(pname, env_vars, default_url, base_env,
+                               supports_health_check) -> _ConnectivityResult:
+        key = ""
+        for ev in env_vars:
+            key = os.getenv(ev, "")
+            if key:
+                break
+        if not key:
+            return _ConnectivityResult(pname, [], [])
+        label = pname.ljust(20)
+        if not supports_health_check:
+            return _ConnectivityResult(
+                pname,
+                [(color("✓", Colors.GREEN), label,
+                  color("(key configured)", Colors.DIM))],
+                [],
+            )
+        try:
+            import httpx
+            base = os.getenv(base_env, "") if base_env else ""
+            # Auto-detect Kimi Code keys (sk-kimi-) → api.kimi.com/coding/v1
+            # (OpenAI-compat surface, which exposes /models for health check).
+            if not base and key.startswith("sk-kimi-"):
+                base = "https://api.kimi.com/coding/v1"
+            # Anthropic-compat endpoints (/anthropic, api.kimi.com/coding
+            # with no /v1) don't support /models. Rewrite to OpenAI-compat
+            # /v1 surface for health checks.
+            if base and base.rstrip("/").endswith("/anthropic"):
+                from agent.auxiliary_client import _to_openai_base_url
+                base = _to_openai_base_url(base)
+            if base_url_host_matches(base, "api.kimi.com") and base.rstrip("/").endswith("/coding"):
+                base = base.rstrip("/") + "/v1"
+            url = (base.rstrip("/") + "/models") if base else default_url
+            headers = {
+                "Authorization": f"Bearer {key}",
+                "User-Agent": _HERMES_USER_AGENT,
+            }
+            if base_url_host_matches(base, "api.kimi.com"):
+                headers["User-Agent"] = "claude-code/0.1.0"
+            r = httpx.get(url, headers=headers, timeout=10)
+            if (
+                pname == "Alibaba/DashScope"
+                and not base
+                and r.status_code == 401
+            ):
+                r = httpx.get(
+                    "https://dashscope.aliyuncs.com/compatible-mode/v1/models",
+                    headers=headers, timeout=10,
+                )
+            if r.status_code == 200:
+                return _ConnectivityResult(
+                    pname,
+                    [(color("✓", Colors.GREEN), label, "")],
+                    [],
+                )
+            if r.status_code == 401:
+                return _ConnectivityResult(
+                    pname,
+                    [(color("✗", Colors.RED), label,
+                      color("(invalid API key)", Colors.DIM))],
+                    [f"Check {env_vars[0]} in .env"],
+                )
+            return _ConnectivityResult(
+                pname,
+                [(color("⚠", Colors.YELLOW), label,
+                  color(f"(HTTP {r.status_code})", Colors.DIM))],
+                [],
+            )
+        except Exception as e:
+            return _ConnectivityResult(
+                pname,
+                [(color("⚠", Colors.YELLOW), label,
+                  color(f"({e})", Colors.DIM))],
+                [],
+            )
+
+    def _probe_bedrock() -> _ConnectivityResult:
+        try:
+            from agent.bedrock_adapter import (
+                has_aws_credentials,
+                resolve_aws_auth_env_var,
+                resolve_bedrock_region,
+            )
+        except ImportError:
+            return _ConnectivityResult("AWS Bedrock", [], [])
+        if not has_aws_credentials():
+            return _ConnectivityResult("AWS Bedrock", [], [])
+        auth_var = resolve_aws_auth_env_var()
+        region = resolve_bedrock_region()
+        label = "AWS Bedrock".ljust(20)
+        try:
+            import boto3
+            from botocore.config import Config as _BotoConfig
+            # Trim retries on the actual Bedrock API call so a transient
+            # failure doesn't pad the doctor run by 30+ seconds.
+            cfg = _BotoConfig(
+                connect_timeout=5,
+                read_timeout=10,
+                retries={"max_attempts": 1},
+            )
+            client = boto3.client("bedrock", region_name=region, config=cfg)
+            resp = client.list_foundation_models()
+            n = len(resp.get("modelSummaries", []))
+            return _ConnectivityResult(
+                "AWS Bedrock",
+                [(color("✓", Colors.GREEN), label,
+                  color(f"({auth_var}, {region}, {n} models)", Colors.DIM))],
+                [],
+            )
+        except ImportError:
+            return _ConnectivityResult(
+                "AWS Bedrock",
+                [(color("⚠", Colors.YELLOW), label,
+                  color(f"(boto3 not installed — {sys.executable} -m pip install boto3)",
+                        Colors.DIM))],
+                [f"Install boto3 for Bedrock: {sys.executable} -m pip install boto3"],
+            )
+        except Exception as e:
+            err_name = type(e).__name__
+            return _ConnectivityResult(
+                "AWS Bedrock",
+                [(color("⚠", Colors.YELLOW), label,
+                  color(f"({err_name}: {e})", Colors.DIM))],
+                [f"AWS Bedrock: {err_name} — check IAM permissions for "
+                 f"bedrock:ListFoundationModels"],
+            )
+
+    # Build the probe submission list in display order
+    _probes.append(("OpenRouter API", _probe_openrouter))
+    _probes.append(("Anthropic API", _probe_anthropic))

-    # -- API-key providers --
-    # Tuple: (name, env_vars, default_url, base_env, supports_models_endpoint)
-    # If supports_models_endpoint is False, we skip the health check and just show "configured"
-    # Cached at module level after first build — profiles auto-extend it.
    global _APIKEY_PROVIDERS_CACHE
    if _APIKEY_PROVIDERS_CACHE is None:
        _APIKEY_PROVIDERS_CACHE = _build_apikey_providers_list()
-    _apikey_providers = _APIKEY_PROVIDERS_CACHE
-    for _pname, _env_vars, _default_url, _base_env, _supports_health_check in _apikey_providers:
-        _key = ""
-        for _ev in _env_vars:
-            _key = os.getenv(_ev, "")
-            if _key:
-                break
-        if _key:
-            _label = _pname.ljust(20)
-            # Some providers (like MiniMax) don't support /models endpoint
-            if not _supports_health_check:
-                print(f"  {color('✓', Colors.GREEN)} {_label} {color('(key configured)', Colors.DIM)}")
-                continue
-            print(f"  Checking {_pname} API...", end="", flush=True)
-            try:
-                import httpx
-                _base = os.getenv(_base_env, "") if _base_env else ""
-                # Auto-detect Kimi Code keys (sk-kimi-) → api.kimi.com/coding/v1
-                # (OpenAI-compat surface, which exposes /models for health check).
-                if not _base and _key.startswith("sk-kimi-"):
-                    _base = "https://api.kimi.com/coding/v1"
-                # Anthropic-compat endpoints (/anthropic, api.kimi.com/coding
-                # with no /v1) don't support /models.  Rewrite to the OpenAI-compat
-                # /v1 surface for health checks.
-                if _base and _base.rstrip("/").endswith("/anthropic"):
-                    from agent.auxiliary_client import _to_openai_base_url
-                    _base = _to_openai_base_url(_base)
-                if base_url_host_matches(_base, "api.kimi.com") and _base.rstrip("/").endswith("/coding"):
-                    _base = _base.rstrip("/") + "/v1"
-                _url = (_base.rstrip("/") + "/models") if _base else _default_url
-                _headers = {
-                    "Authorization": f"Bearer {_key}",
-                    "User-Agent": _HERMES_USER_AGENT,
-                }
-                if base_url_host_matches(_base, "api.kimi.com"):
-                    _headers["User-Agent"] = "claude-code/0.1.0"
-                _resp = httpx.get(
-                    _url,
-                    headers=_headers,
-                    timeout=10,
-                )
-                if (
-                    _pname == "Alibaba/DashScope"
-                    and not _base
-                    and _resp.status_code == 401
-                ):
-                    _resp = httpx.get(
-                        "https://dashscope.aliyuncs.com/compatible-mode/v1/models",
-                        headers=_headers,
-                        timeout=10,
-                    )
-                if _resp.status_code == 200:
-                    print(f"\r  {color('✓', Colors.GREEN)} {_label}                          ")
-                elif _resp.status_code == 401:
-                    print(f"\r  {color('✗', Colors.RED)} {_label} {color('(invalid API key)', Colors.DIM)}           ")
-                    issues.append(f"Check {_env_vars[0]} in .env")
-                else:
-                    print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'(HTTP {_resp.status_code})', Colors.DIM)}           ")
-            except Exception as _e:
-                print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'({_e})', Colors.DIM)}           ")
+    for _entry in _APIKEY_PROVIDERS_CACHE:
+        _pname, _env_vars, _default_url, _base_env, _supports = _entry
+        # Capture loop vars by binding default args — without this, all closures
+        # would share the final iteration's values and every probe would hit
+        # the last provider's URL.
+        _probes.append((_pname, lambda p=_pname, e=_env_vars, u=_default_url,
+                                       b=_base_env, s=_supports:
+                                _probe_apikey_provider(p, e, u, b, s)))

-    # -- AWS Bedrock --
-    # Bedrock uses the AWS SDK credential chain, not API keys.
+    _probes.append(("AWS Bedrock", _probe_bedrock))
+
+    # Print a single status line so users see something happening, then
+    # fan out. ``\r`` clears it once the first real result line lands.
+    print(f"  {color(f'Running {len(_probes)} connectivity checks in parallel…', Colors.DIM)}",
+          end="", flush=True)
+
+    # Disable boto3's EC2 instance-metadata-service probe for the duration
+    # of the parallel block. boto's default credential chain tries
+    # 169.254.169.254 with a multi-second timeout when we're not on EC2,
+    # which dominated the section's wall time before this fix
+    # (~2s on a developer laptop, even with the rest parallelized).
+    # Set on the parent thread before submitting work so the env-var
+    # mutation never races with another worker. has_aws_credentials() in
+    # the bedrock probe already gates on real env-var creds, so IMDS is
+    # never the legitimate source for `hermes doctor`.
+    _imds_prev = os.environ.get("AWS_EC2_METADATA_DISABLED")
+    os.environ["AWS_EC2_METADATA_DISABLED"] = "true"
    try:
-        from agent.bedrock_adapter import has_aws_credentials, resolve_aws_auth_env_var, resolve_bedrock_region
-        if has_aws_credentials():
-            _auth_var = resolve_aws_auth_env_var()
-            _region = resolve_bedrock_region()
-            _label = "AWS Bedrock".ljust(20)
-            print(f"  Checking AWS Bedrock...", end="", flush=True)
-            try:
-                import boto3
-                _br_client = boto3.client("bedrock", region_name=_region)
-                _br_resp = _br_client.list_foundation_models()
-                _model_count = len(_br_resp.get("modelSummaries", []))
-                print(f"\r  {color('✓', Colors.GREEN)} {_label} {color(f'({_auth_var}, {_region}, {_model_count} models)', Colors.DIM)}           ")
-            except ImportError:
-                print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'(boto3 not installed — {sys.executable} -m pip install boto3)', Colors.DIM)}           ")
-                issues.append(f"Install boto3 for Bedrock: {sys.executable} -m pip install boto3")
-            except Exception as _e:
-                _err_name = type(_e).__name__
-                print(f"\r  {color('⚠', Colors.YELLOW)} {_label} {color(f'({_err_name}: {_e})', Colors.DIM)}           ")
-                issues.append(f"AWS Bedrock: {_err_name} — check IAM permissions for bedrock:ListFoundationModels")
-    except ImportError:
-        pass  # bedrock_adapter not available — skip silently
+        # 8 workers is plenty — each probe is a single HTTP call plus a TLS
+        # handshake. More than that wastes thread-startup cost and risks
+        # noisy output if anything ever printed from inside a worker.
+        with _futures.ThreadPoolExecutor(max_workers=8,
+                                         thread_name_prefix="doctor-probe") as _ex:
+            _futures_in_order = [_ex.submit(_fn) for _, _fn in _probes]
+            _results = [_f.result() for _f in _futures_in_order]
+    finally:
+        if _imds_prev is None:
+            os.environ.pop("AWS_EC2_METADATA_DISABLED", None)
+        else:
+            os.environ["AWS_EC2_METADATA_DISABLED"] = _imds_prev
+
+    # Clear the "Running …" line and print all results in submission order.
+    print("\r" + " " * 70 + "\r", end="")
+    for _r in _results:
+        for _glyph, _label, _detail in _r.lines:
+            if _detail:
+                print(f"  {_glyph} {_label} {_detail}")
+            else:
+                print(f"  {_glyph} {_label}")
+        for _issue in _r.issues:
+            issues.append(_issue)

    # =========================================================================
    # Check: Submodules