perf(doctor): parallelize API connectivity checks and disable IMDS (#22766)

`hermes doctor` ran every connectivity probe sequentially and on a typical
developer laptop spent ~2s of its ~5s wall time inside boto3's EC2
instance-metadata-service lookup (169.254.169.254) — the default
AWS credential chain probes IMDS even when AWS_BEARER_TOKEN_BEDROCK
or AWS_ACCESS_KEY_ID is the only legitimate source.

Refactor the API Connectivity section so every probe (OpenRouter,
Anthropic, ~16 static API-key providers + dynamic profiles, AWS
Bedrock) is a pure function returning a structured result, then
fan them out through a ThreadPoolExecutor(max_workers=8). Output
order, glyphs, colours, padding, and issue strings stay byte-for-byte
identical to the sequential implementation; results are gathered
in submission order.

Also disable IMDS for the parallel block by setting
AWS_EC2_METADATA_DISABLED=true on the parent thread before submitting
work (and restoring its prior value in a finally block). Bedrock's
real-API call gets a Config(connect_timeout=5, read_timeout=10,
retries={max_attempts:1}) so a transient regional failure can't pad
the run by 30+ seconds.

Measured impact (5-run medians, 9950X3D):
  hermes doctor:           5.07 → 2.16 s  (-57%)

Doctor tests: 48 passed (test_doctor.py + test_doctor_command_install.py).

The remaining ~2s of wall is import overhead + a couple of one-off
network calls outside the API Connectivity section (`fetch_models_dev`
provider catalog refresh, Nous OAuth refresh in `Auth Providers`).
Those are next-tier targets, not part of this change.
This commit is contained in:
Teknium 2026-05-09 13:03:20 -07:00 committed by GitHub
parent 8f711f79a4
commit e612c3d6f0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1166,44 +1166,92 @@ def run_doctor(args):
# =========================================================================
print()
print(color("◆ API Connectivity", Colors.CYAN, Colors.BOLD))
openrouter_key = os.getenv("OPENROUTER_API_KEY")
if openrouter_key:
print(" Checking OpenRouter API...", end="", flush=True)
# Refactor: every connectivity probe below is HTTP-bound and fully
# independent. Running them in series spent ~5s wall on a typical
# workstation (2s of that was boto3's IMDS lookup for AWS credentials,
# which times out unless you're actually on EC2). Threading them with
# a small executor pool collapses the section to roughly the slowest
# single probe — about 2s — without changing the output format.
#
# Each ``_probe_*`` helper is a pure function: takes its inputs,
# makes one HTTP/SDK call, returns a ``_ConnectivityResult`` carrying
# the line(s) to print and any issue strings to append. No globals,
# no shared mutable state, no printing inside the workers.
import concurrent.futures as _futures
from collections import namedtuple as _namedtuple
_ConnectivityResult = _namedtuple(
"_ConnectivityResult", ["label", "lines", "issues"]
)
_probes: list = [] # list of (label, callable) submitted in display order
def _probe_openrouter() -> _ConnectivityResult:
key = os.getenv("OPENROUTER_API_KEY")
if not key:
return _ConnectivityResult(
"OpenRouter API",
[(color("", Colors.YELLOW), "OpenRouter API",
color("(not configured)", Colors.DIM))],
[],
)
try:
import httpx
response = httpx.get(
r = httpx.get(
OPENROUTER_MODELS_URL,
headers={"Authorization": f"Bearer {openrouter_key}"},
timeout=10
headers={"Authorization": f"Bearer {key}"},
timeout=10,
)
if response.status_code == 200:
print(f"\r {color('', Colors.GREEN)} OpenRouter API ")
elif response.status_code == 401:
print(f"\r {color('', Colors.RED)} OpenRouter API {color('(invalid API key)', Colors.DIM)} ")
issues.append("Check OPENROUTER_API_KEY in .env")
elif response.status_code == 402:
print(f"\r {color('', Colors.RED)} OpenRouter API {color('(out of credits — payment required)', Colors.DIM)}")
issues.append(
"OpenRouter account has insufficient credits. "
"Fix: run 'hermes config set model.provider <provider>' to switch providers, "
"or fund your OpenRouter account at https://openrouter.ai/settings/credits"
if r.status_code == 200:
return _ConnectivityResult(
"OpenRouter API",
[(color("", Colors.GREEN), "OpenRouter API", "")],
[],
)
elif response.status_code == 429:
print(f"\r {color('', Colors.RED)} OpenRouter API {color('(rate limited)', Colors.DIM)} ")
issues.append("OpenRouter rate limit hit — consider switching to a different provider or waiting")
else:
print(f"\r {color('', Colors.RED)} OpenRouter API {color(f'(HTTP {response.status_code})', Colors.DIM)} ")
if r.status_code == 401:
return _ConnectivityResult(
"OpenRouter API",
[(color("", Colors.RED), "OpenRouter API",
color("(invalid API key)", Colors.DIM))],
["Check OPENROUTER_API_KEY in .env"],
)
if r.status_code == 402:
return _ConnectivityResult(
"OpenRouter API",
[(color("", Colors.RED), "OpenRouter API",
color("(out of credits — payment required)", Colors.DIM))],
["OpenRouter account has insufficient credits. "
"Fix: run 'hermes config set model.provider <provider>' "
"to switch providers, or fund your OpenRouter account "
"at https://openrouter.ai/settings/credits"],
)
if r.status_code == 429:
return _ConnectivityResult(
"OpenRouter API",
[(color("", Colors.RED), "OpenRouter API",
color("(rate limited)", Colors.DIM))],
["OpenRouter rate limit hit — consider switching to "
"a different provider or waiting"],
)
return _ConnectivityResult(
"OpenRouter API",
[(color("", Colors.RED), "OpenRouter API",
color(f"(HTTP {r.status_code})", Colors.DIM))],
[],
)
except Exception as e:
print(f"\r {color('', Colors.RED)} OpenRouter API {color(f'({e})', Colors.DIM)} ")
issues.append("Check network connectivity")
else:
check_warn("OpenRouter API", "(not configured)")
from hermes_cli.auth import get_anthropic_key
anthropic_key = get_anthropic_key()
if anthropic_key:
print(" Checking Anthropic API...", end="", flush=True)
return _ConnectivityResult(
"OpenRouter API",
[(color("", Colors.RED), "OpenRouter API",
color(f"({e})", Colors.DIM))],
["Check network connectivity"],
)
def _probe_anthropic() -> _ConnectivityResult:
from hermes_cli.auth import get_anthropic_key
key = get_anthropic_key()
if not key:
return _ConnectivityResult("Anthropic API", [], [])
try:
import httpx
from agent.anthropic_adapter import (
@ -1212,140 +1260,247 @@ def run_doctor(args):
_OAUTH_ONLY_BETAS,
_CONTEXT_1M_BETA,
)
headers = {"anthropic-version": "2023-06-01"}
is_oauth = _is_oauth_token(anthropic_key)
is_oauth = _is_oauth_token(key)
if is_oauth:
headers["Authorization"] = f"Bearer {anthropic_key}"
headers["Authorization"] = f"Bearer {key}"
headers["anthropic-beta"] = ",".join(_COMMON_BETAS + _OAUTH_ONLY_BETAS)
else:
headers["x-api-key"] = anthropic_key
response = httpx.get(
headers["x-api-key"] = key
r = httpx.get(
"https://api.anthropic.com/v1/models",
headers=headers,
timeout=10
headers=headers, timeout=10,
)
# Reactive recovery: OAuth subscriptions that don't include 1M
# context reject the request with 400 "long context beta is not
# yet available for this subscription". Retry once with that
# beta stripped so the doctor check doesn't falsely report the
# Anthropic API as unreachable for those users.
# Reactive recovery: OAuth subscriptions without 1M context reject the
# request with 400 "long context beta is not yet available for this
# subscription". Retry once with that beta stripped so the doctor
# check doesn't falsely report Anthropic as unreachable.
if (
is_oauth
and response.status_code == 400
and "long context beta" in response.text.lower()
and "not yet available" in response.text.lower()
and r.status_code == 400
and "long context beta" in r.text.lower()
and "not yet available" in r.text.lower()
):
headers["anthropic-beta"] = ",".join(
[b for b in _COMMON_BETAS if b != _CONTEXT_1M_BETA] + list(_OAUTH_ONLY_BETAS)
[b for b in _COMMON_BETAS if b != _CONTEXT_1M_BETA]
+ list(_OAUTH_ONLY_BETAS)
)
response = httpx.get(
r = httpx.get(
"https://api.anthropic.com/v1/models",
headers=headers,
timeout=10,
headers=headers, timeout=10,
)
if response.status_code == 200:
print(f"\r {color('', Colors.GREEN)} Anthropic API ")
elif response.status_code == 401:
print(f"\r {color('', Colors.RED)} Anthropic API {color('(invalid API key)', Colors.DIM)} ")
else:
msg = "(couldn't verify)"
print(f"\r {color('', Colors.YELLOW)} Anthropic API {color(msg, Colors.DIM)} ")
if r.status_code == 200:
return _ConnectivityResult(
"Anthropic API",
[(color("", Colors.GREEN), "Anthropic API", "")],
[],
)
if r.status_code == 401:
return _ConnectivityResult(
"Anthropic API",
[(color("", Colors.RED), "Anthropic API",
color("(invalid API key)", Colors.DIM))],
[],
)
return _ConnectivityResult(
"Anthropic API",
[(color("", Colors.YELLOW), "Anthropic API",
color("(couldn't verify)", Colors.DIM))],
[],
)
except Exception as e:
print(f"\r {color('', Colors.YELLOW)} Anthropic API {color(f'({e})', Colors.DIM)} ")
return _ConnectivityResult(
"Anthropic API",
[(color("", Colors.YELLOW), "Anthropic API",
color(f"({e})", Colors.DIM))],
[],
)
def _probe_apikey_provider(pname, env_vars, default_url, base_env,
supports_health_check) -> _ConnectivityResult:
key = ""
for ev in env_vars:
key = os.getenv(ev, "")
if key:
break
if not key:
return _ConnectivityResult(pname, [], [])
label = pname.ljust(20)
if not supports_health_check:
return _ConnectivityResult(
pname,
[(color("", Colors.GREEN), label,
color("(key configured)", Colors.DIM))],
[],
)
try:
import httpx
base = os.getenv(base_env, "") if base_env else ""
# Auto-detect Kimi Code keys (sk-kimi-) → api.kimi.com/coding/v1
# (OpenAI-compat surface, which exposes /models for health check).
if not base and key.startswith("sk-kimi-"):
base = "https://api.kimi.com/coding/v1"
# Anthropic-compat endpoints (/anthropic, api.kimi.com/coding
# with no /v1) don't support /models. Rewrite to OpenAI-compat
# /v1 surface for health checks.
if base and base.rstrip("/").endswith("/anthropic"):
from agent.auxiliary_client import _to_openai_base_url
base = _to_openai_base_url(base)
if base_url_host_matches(base, "api.kimi.com") and base.rstrip("/").endswith("/coding"):
base = base.rstrip("/") + "/v1"
url = (base.rstrip("/") + "/models") if base else default_url
headers = {
"Authorization": f"Bearer {key}",
"User-Agent": _HERMES_USER_AGENT,
}
if base_url_host_matches(base, "api.kimi.com"):
headers["User-Agent"] = "claude-code/0.1.0"
r = httpx.get(url, headers=headers, timeout=10)
if (
pname == "Alibaba/DashScope"
and not base
and r.status_code == 401
):
r = httpx.get(
"https://dashscope.aliyuncs.com/compatible-mode/v1/models",
headers=headers, timeout=10,
)
if r.status_code == 200:
return _ConnectivityResult(
pname,
[(color("", Colors.GREEN), label, "")],
[],
)
if r.status_code == 401:
return _ConnectivityResult(
pname,
[(color("", Colors.RED), label,
color("(invalid API key)", Colors.DIM))],
[f"Check {env_vars[0]} in .env"],
)
return _ConnectivityResult(
pname,
[(color("", Colors.YELLOW), label,
color(f"(HTTP {r.status_code})", Colors.DIM))],
[],
)
except Exception as e:
return _ConnectivityResult(
pname,
[(color("", Colors.YELLOW), label,
color(f"({e})", Colors.DIM))],
[],
)
def _probe_bedrock() -> _ConnectivityResult:
try:
from agent.bedrock_adapter import (
has_aws_credentials,
resolve_aws_auth_env_var,
resolve_bedrock_region,
)
except ImportError:
return _ConnectivityResult("AWS Bedrock", [], [])
if not has_aws_credentials():
return _ConnectivityResult("AWS Bedrock", [], [])
auth_var = resolve_aws_auth_env_var()
region = resolve_bedrock_region()
label = "AWS Bedrock".ljust(20)
try:
import boto3
from botocore.config import Config as _BotoConfig
# Trim retries on the actual Bedrock API call so a transient
# failure doesn't pad the doctor run by 30+ seconds.
cfg = _BotoConfig(
connect_timeout=5,
read_timeout=10,
retries={"max_attempts": 1},
)
client = boto3.client("bedrock", region_name=region, config=cfg)
resp = client.list_foundation_models()
n = len(resp.get("modelSummaries", []))
return _ConnectivityResult(
"AWS Bedrock",
[(color("", Colors.GREEN), label,
color(f"({auth_var}, {region}, {n} models)", Colors.DIM))],
[],
)
except ImportError:
return _ConnectivityResult(
"AWS Bedrock",
[(color("", Colors.YELLOW), label,
color(f"(boto3 not installed — {sys.executable} -m pip install boto3)",
Colors.DIM))],
[f"Install boto3 for Bedrock: {sys.executable} -m pip install boto3"],
)
except Exception as e:
err_name = type(e).__name__
return _ConnectivityResult(
"AWS Bedrock",
[(color("", Colors.YELLOW), label,
color(f"({err_name}: {e})", Colors.DIM))],
[f"AWS Bedrock: {err_name} — check IAM permissions for "
f"bedrock:ListFoundationModels"],
)
# Build the probe submission list in display order
_probes.append(("OpenRouter API", _probe_openrouter))
_probes.append(("Anthropic API", _probe_anthropic))
# -- API-key providers --
# Tuple: (name, env_vars, default_url, base_env, supports_models_endpoint)
# If supports_models_endpoint is False, we skip the health check and just show "configured"
# Cached at module level after first build — profiles auto-extend it.
global _APIKEY_PROVIDERS_CACHE
if _APIKEY_PROVIDERS_CACHE is None:
_APIKEY_PROVIDERS_CACHE = _build_apikey_providers_list()
_apikey_providers = _APIKEY_PROVIDERS_CACHE
for _pname, _env_vars, _default_url, _base_env, _supports_health_check in _apikey_providers:
_key = ""
for _ev in _env_vars:
_key = os.getenv(_ev, "")
if _key:
break
if _key:
_label = _pname.ljust(20)
# Some providers (like MiniMax) don't support /models endpoint
if not _supports_health_check:
print(f" {color('', Colors.GREEN)} {_label} {color('(key configured)', Colors.DIM)}")
continue
print(f" Checking {_pname} API...", end="", flush=True)
try:
import httpx
_base = os.getenv(_base_env, "") if _base_env else ""
# Auto-detect Kimi Code keys (sk-kimi-) → api.kimi.com/coding/v1
# (OpenAI-compat surface, which exposes /models for health check).
if not _base and _key.startswith("sk-kimi-"):
_base = "https://api.kimi.com/coding/v1"
# Anthropic-compat endpoints (/anthropic, api.kimi.com/coding
# with no /v1) don't support /models. Rewrite to the OpenAI-compat
# /v1 surface for health checks.
if _base and _base.rstrip("/").endswith("/anthropic"):
from agent.auxiliary_client import _to_openai_base_url
_base = _to_openai_base_url(_base)
if base_url_host_matches(_base, "api.kimi.com") and _base.rstrip("/").endswith("/coding"):
_base = _base.rstrip("/") + "/v1"
_url = (_base.rstrip("/") + "/models") if _base else _default_url
_headers = {
"Authorization": f"Bearer {_key}",
"User-Agent": _HERMES_USER_AGENT,
}
if base_url_host_matches(_base, "api.kimi.com"):
_headers["User-Agent"] = "claude-code/0.1.0"
_resp = httpx.get(
_url,
headers=_headers,
timeout=10,
)
if (
_pname == "Alibaba/DashScope"
and not _base
and _resp.status_code == 401
):
_resp = httpx.get(
"https://dashscope.aliyuncs.com/compatible-mode/v1/models",
headers=_headers,
timeout=10,
)
if _resp.status_code == 200:
print(f"\r {color('', Colors.GREEN)} {_label} ")
elif _resp.status_code == 401:
print(f"\r {color('', Colors.RED)} {_label} {color('(invalid API key)', Colors.DIM)} ")
issues.append(f"Check {_env_vars[0]} in .env")
else:
print(f"\r {color('', Colors.YELLOW)} {_label} {color(f'(HTTP {_resp.status_code})', Colors.DIM)} ")
except Exception as _e:
print(f"\r {color('', Colors.YELLOW)} {_label} {color(f'({_e})', Colors.DIM)} ")
for _entry in _APIKEY_PROVIDERS_CACHE:
_pname, _env_vars, _default_url, _base_env, _supports = _entry
# Capture loop vars by binding default args — without this, all closures
# would share the final iteration's values and every probe would hit
# the last provider's URL.
_probes.append((_pname, lambda p=_pname, e=_env_vars, u=_default_url,
b=_base_env, s=_supports:
_probe_apikey_provider(p, e, u, b, s)))
# -- AWS Bedrock --
# Bedrock uses the AWS SDK credential chain, not API keys.
_probes.append(("AWS Bedrock", _probe_bedrock))
# Print a single status line so users see something happening, then
# fan out. ``\r`` clears it once the first real result line lands.
print(f" {color(f'Running {len(_probes)} connectivity checks in parallel…', Colors.DIM)}",
end="", flush=True)
# Disable boto3's EC2 instance-metadata-service probe for the duration
# of the parallel block. boto's default credential chain tries
# 169.254.169.254 with a multi-second timeout when we're not on EC2,
# which dominated the section's wall time before this fix
# (~2s on a developer laptop, even with the rest parallelized).
# Set on the parent thread before submitting work so the env-var
# mutation never races with another worker. has_aws_credentials() in
# the bedrock probe already gates on real env-var creds, so IMDS is
# never the legitimate source for `hermes doctor`.
_imds_prev = os.environ.get("AWS_EC2_METADATA_DISABLED")
os.environ["AWS_EC2_METADATA_DISABLED"] = "true"
try:
from agent.bedrock_adapter import has_aws_credentials, resolve_aws_auth_env_var, resolve_bedrock_region
if has_aws_credentials():
_auth_var = resolve_aws_auth_env_var()
_region = resolve_bedrock_region()
_label = "AWS Bedrock".ljust(20)
print(f" Checking AWS Bedrock...", end="", flush=True)
try:
import boto3
_br_client = boto3.client("bedrock", region_name=_region)
_br_resp = _br_client.list_foundation_models()
_model_count = len(_br_resp.get("modelSummaries", []))
print(f"\r {color('', Colors.GREEN)} {_label} {color(f'({_auth_var}, {_region}, {_model_count} models)', Colors.DIM)} ")
except ImportError:
print(f"\r {color('', Colors.YELLOW)} {_label} {color(f'(boto3 not installed — {sys.executable} -m pip install boto3)', Colors.DIM)} ")
issues.append(f"Install boto3 for Bedrock: {sys.executable} -m pip install boto3")
except Exception as _e:
_err_name = type(_e).__name__
print(f"\r {color('', Colors.YELLOW)} {_label} {color(f'({_err_name}: {_e})', Colors.DIM)} ")
issues.append(f"AWS Bedrock: {_err_name} — check IAM permissions for bedrock:ListFoundationModels")
except ImportError:
pass # bedrock_adapter not available — skip silently
# 8 workers is plenty — each probe is a single HTTP call plus a TLS
# handshake. More than that wastes thread-startup cost and risks
# noisy output if anything ever printed from inside a worker.
with _futures.ThreadPoolExecutor(max_workers=8,
thread_name_prefix="doctor-probe") as _ex:
_futures_in_order = [_ex.submit(_fn) for _, _fn in _probes]
_results = [_f.result() for _f in _futures_in_order]
finally:
if _imds_prev is None:
os.environ.pop("AWS_EC2_METADATA_DISABLED", None)
else:
os.environ["AWS_EC2_METADATA_DISABLED"] = _imds_prev
# Clear the "Running …" line and print all results in submission order.
print("\r" + " " * 70 + "\r", end="")
for _r in _results:
for _glyph, _label, _detail in _r.lines:
if _detail:
print(f" {_glyph} {_label} {_detail}")
else:
print(f" {_glyph} {_label}")
for _issue in _r.issues:
issues.append(_issue)
# =========================================================================
# Check: Submodules