fix(mcp-oauth): persist OAuth server metadata across process restarts (#21226)

The MCP SDK discovers OAuth server metadata (token_endpoint, etc.) on
demand and keeps it in memory only. Without disk persistence, a restart
with valid cached refresh tokens forces the SDK to fall back to the
guessed '{server_url}/token' path — which returns 404 on most real
providers (Notion, Atlassian, GitHub remote MCP, etc.) and triggers a
full browser re-authorization even though the refresh token is fine.

Add a .meta.json file next to the existing tokens/client_info files:

  HERMES_HOME/mcp-tokens/<server>.json        -- tokens (existing)
  HERMES_HOME/mcp-tokens/<server>.client.json -- client info (existing)
  HERMES_HOME/mcp-tokens/<server>.meta.json   -- oauth metadata (new)

Changes:
- HermesTokenStorage.save_oauth_metadata / load_oauth_metadata / _meta_path
  — disk layer for the discovered OAuthMetadata.
- HermesTokenStorage.remove() now also clears .meta.json so
  'hermes mcp remove <name>' and the manager's remove() path clean up fully.
- HermesMCPOAuthProvider._initialize cold-restores from disk before the
  existing pre-flight discovery runs. If disk has metadata we skip the
  discovery HTTP round-trips entirely.
- HermesMCPOAuthProvider._prefetch_oauth_metadata now persists the
  authorization server metadata (ASM) as soon as it's discovered, so even
  the first pre-flight run seeds disk.
- HermesMCPOAuthProvider._persist_oauth_metadata_if_changed() is called
  at the end of async_auth_flow so metadata discovered via the SDK's
  lazy 401-branch (not pre-flight) is also saved for next time.

Tests cover the storage roundtrip (save/load/missing/corrupt/remove) and
the manager provider path (cold-load restore, skip-when-in-memory,
persist-on-discover, noop-when-unchanged, end-to-end async_auth_flow).

Co-authored-by: nocturnum91 <50326054+nocturnum91@users.noreply.github.com>
This commit is contained in:
Teknium 2026-05-07 05:35:33 -07:00 committed by GitHub
parent 3c439ec681
commit c4a7992317
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 293 additions and 1 deletions

View file

@ -61,6 +61,7 @@ try:
from mcp.shared.auth import (
OAuthClientInformationFull,
OAuthClientMetadata,
OAuthMetadata,
OAuthToken,
)
@ -212,6 +213,7 @@ class HermesTokenStorage:
HERMES_HOME/mcp-tokens/<server_name>.json -- tokens
HERMES_HOME/mcp-tokens/<server_name>.client.json -- client info
HERMES_HOME/mcp-tokens/<server_name>.meta.json -- oauth server metadata
"""
def __init__(self, server_name: str):
@ -223,6 +225,9 @@ class HermesTokenStorage:
def _client_info_path(self) -> Path:
    """Return the path of this server's ``.client.json`` file.

    The file lives directly inside the directory returned by
    ``_get_token_dir()`` and is named after ``self._server_name``.
    """
    token_dir = _get_token_dir()
    return token_dir / f"{self._server_name}.client.json"
def _meta_path(self) -> Path:
    """Return the path of this server's ``.meta.json`` file.

    The file lives directly inside the directory returned by
    ``_get_token_dir()`` and is named after ``self._server_name``.
    """
    token_dir = _get_token_dir()
    return token_dir / f"{self._server_name}.meta.json"
# -- tokens ------------------------------------------------------------
async def get_tokens(self) -> "OAuthToken | None":
@ -300,11 +305,33 @@ class HermesTokenStorage:
_write_json(self._client_info_path(), client_info.model_dump(mode="json", exclude_none=True))
logger.debug("OAuth client info saved for %s", self._server_name)
# -- oauth server metadata --------------------------------------------
# The MCP SDK keeps discovered ``OAuthMetadata`` (token endpoint URL,
# etc.) in memory only. Persisting it here lets a restarted process
# refresh tokens without re-running metadata discovery. Without this,
# cold-start refresh requests fall back to the SDK's guessed
# ``{server_url}/token`` which returns 404 on most real providers and
# forces a full browser re-authorization.
def save_oauth_metadata(self, metadata: "OAuthMetadata") -> None:
    """Write *metadata* to this server's ``.meta.json`` file.

    The model is dumped in JSON mode with ``None``-valued fields omitted,
    then handed to ``_write_json`` together with the target path.
    """
    target = self._meta_path()
    payload = metadata.model_dump(mode="json", exclude_none=True)
    _write_json(target, payload)
    logger.debug("OAuth metadata saved for %s", self._server_name)
def load_oauth_metadata(self) -> "OAuthMetadata | None":
    """Read this server's ``.meta.json`` file back into an ``OAuthMetadata``.

    Returns ``None`` when ``_read_json`` yields nothing for the metadata
    path, or when the stored data fails model validation (a warning is
    logged in that case and the file content is ignored).
    """
    raw = _read_json(self._meta_path())
    if raw is not None:
        try:
            return OAuthMetadata.model_validate(raw)
        except (ValueError, TypeError, KeyError) as err:
            # Unusable on-disk data is treated the same as no data at all.
            logger.warning("Corrupt OAuth metadata at %s -- ignoring: %s", self._meta_path(), err)
    return None
# -- cleanup -----------------------------------------------------------
def remove(self) -> None:
    """Delete all stored OAuth state for this server.

    Unlinks the tokens file, the client-info file and the OAuth server
    metadata file. Files that are already absent are silently skipped.
    """
    # One loop over all three files. The earlier two-path loop header
    # (tokens + client info only) is superseded by this one; keeping both
    # would either be a syntax error or a redundant nested loop.
    for path in (self._tokens_path(), self._client_info_path(), self._meta_path()):
        path.unlink(missing_ok=True)
def has_cached_tokens(self) -> bool:

View file

@ -148,6 +148,27 @@ def _make_hermes_provider_class() -> Optional[type]:
if tokens is not None and tokens.expires_in is not None:
self.context.update_token_expiry(tokens)
# Cold-load: restore OAuth server metadata from disk before any
# refresh attempt. Without this, a restarted process with cached
# tokens but no in-memory metadata would fall back to the SDK's
# guessed ``{server_url}/token`` path (returns 404 on most real
# providers) and require a full browser re-authorization.
storage = self.context.storage
from tools.mcp_oauth import HermesTokenStorage
if (
isinstance(storage, HermesTokenStorage)
and self.context.oauth_metadata is None
):
meta = storage.load_oauth_metadata()
if meta is not None:
self.context.oauth_metadata = meta
logger.debug(
"MCP OAuth '%s': restored metadata from disk "
"(token_endpoint=%s)",
self._hermes_server_name,
meta.token_endpoint,
)
# Pre-flight OAuth AS discovery so ``_refresh_token`` has a
# correct ``token_endpoint`` before the first refresh attempt.
# Only runs when we have tokens on cold-load but no cached
@ -229,6 +250,12 @@ def _make_hermes_provider_class() -> Optional[type]:
break
if asm:
self.context.oauth_metadata = asm
# Persist immediately so a subsequent cold-load can
# skip discovery entirely.
storage = self.context.storage
from tools.mcp_oauth import HermesTokenStorage
if isinstance(storage, HermesTokenStorage):
storage.save_oauth_metadata(asm)
logger.debug(
"MCP OAuth '%s': pre-flight ASM discovered "
"token_endpoint=%s",
@ -236,6 +263,27 @@ def _make_hermes_provider_class() -> Optional[type]:
)
break
def _persist_oauth_metadata_if_changed(self) -> None:
    """Persist discovered OAuth metadata for future process restarts.

    Called after the SDK's normal 401-branch auth flow completes so
    metadata discovered via the lazy path (not pre-flight) is also
    saved. No-op when there is nothing to persist, when the storage
    backend is not a ``HermesTokenStorage``, or when the copy already on
    disk serializes identically to the in-memory metadata.
    """
    meta = self.context.oauth_metadata
    if meta is None:
        return
    storage = self.context.storage
    # Function-scope import, matching the other call sites in this provider.
    from tools.mcp_oauth import HermesTokenStorage
    if not isinstance(storage, HermesTokenStorage):
        return
    existing = storage.load_oauth_metadata()
    if existing is not None:
        # Compare the complete serialized form rather than just
        # ``token_endpoint``: a change to any other field must also reach
        # disk, otherwise the next cold-load restores stale metadata.
        # A spurious mismatch only costs one redundant write.
        dump_kwargs = {"mode": "json", "exclude_none": True}
        if existing.model_dump(**dump_kwargs) == meta.model_dump(**dump_kwargs):
            return
    storage.save_oauth_metadata(meta)
async def async_auth_flow(self, request): # type: ignore[override]
# Pre-flow hook: ask the manager to refresh from disk if needed.
# Any failure here is non-fatal — we just log and proceed with
@ -271,6 +319,9 @@ def _make_hermes_provider_class() -> Optional[type]:
incoming = yield outgoing
outgoing = await inner.asend(incoming)
except StopAsyncIteration:
# Persist any metadata the SDK discovered lazily during the
# 401 branch so a subsequent cold-load skips discovery.
self._persist_oauth_metadata_if_changed()
return
return HermesMCPOAuthProvider