feat: expand /fast to all OpenAI Priority Processing models (#6960)

Previously /fast only supported gpt-5.4 and forced a provider switch to
openai-codex. Now supports all 13 models from OpenAI's Priority Processing
pricing table (gpt-5.4, gpt-5.4-mini, gpt-5.2, gpt-5.1, gpt-5, gpt-5-mini,
gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, gpt-4o, gpt-4o-mini, o3, o4-mini).

Key changes:
- Replaced _FAST_MODE_BACKEND_CONFIG with _PRIORITY_PROCESSING_MODELS frozenset
- Removed provider-forcing logic — service_tier is now injected into whatever
  API path the user is already on (Codex Responses, Chat Completions, or
  OpenRouter passthrough)
- Added request_overrides support to chat_completions path in run_agent.py
- Updated messaging from 'Codex inference tier' to 'Priority Processing'
- Expanded test coverage for all supported models
This commit is contained in:
Teknium 2026-04-09 22:06:30 -07:00 committed by GitHub
parent d416a69288
commit 8394b5ddd2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 144 additions and 110 deletions

35
cli.py
View file

@@ -2572,7 +2572,7 @@ class HermesCLI:
def _resolve_turn_agent_config(self, user_message: str) -> dict:
"""Resolve model/runtime overrides for a single user turn."""
from agent.smart_model_routing import resolve_turn_route
from hermes_cli.models import resolve_fast_mode_runtime
from hermes_cli.models import resolve_fast_mode_overrides
route = resolve_turn_route(
user_message,
@@ -2595,27 +2595,10 @@ class HermesCLI:
return route
try:
fast_runtime = resolve_fast_mode_runtime(route.get("model"))
overrides = resolve_fast_mode_overrides(route.get("model"))
except Exception:
route["request_overrides"] = None
return route
if not fast_runtime:
route["request_overrides"] = None
return route
runtime = fast_runtime["runtime"]
route["runtime"] = runtime
route["request_overrides"] = fast_runtime["request_overrides"]
route["label"] = f"fast route → {route.get('model')} ({runtime.get('provider')})"
route["signature"] = (
route.get("model"),
runtime.get("provider"),
runtime.get("base_url"),
runtime.get("api_mode"),
runtime.get("command"),
tuple(runtime.get("args") or ()),
json.dumps(route["request_overrides"], sort_keys=True),
)
overrides = None
route["request_overrides"] = overrides
return route
def _init_agent(self, *, model_override: str = None, runtime_override: dict = None, route_label: str = None, request_overrides: dict | None = None) -> bool:
@@ -5662,15 +5645,15 @@ class HermesCLI:
_cprint(f" {_GOLD}✓ Reasoning effort set to '{arg}' (session only){_RST}")
def _handle_fast_command(self, cmd: str):
"""Handle /fast — choose the Codex Responses service tier."""
"""Handle /fast — toggle OpenAI Priority Processing (service_tier)."""
if not self._fast_command_available():
_cprint(" (._.) /fast is only available for models that explicitly expose a fast backend.")
_cprint(" (._.) /fast is only available for OpenAI models that support Priority Processing.")
return
parts = cmd.strip().split(maxsplit=1)
if len(parts) < 2 or parts[1].strip().lower() == "status":
status = "fast" if self.service_tier == "priority" else "normal"
_cprint(f" {_GOLD}Codex inference tier: {status}{_RST}")
_cprint(f" {_GOLD}Priority Processing: {status}{_RST}")
_cprint(f" {_DIM}Usage: /fast [normal|fast|status]{_RST}")
return
@@ -5691,9 +5674,9 @@ class HermesCLI:
self.agent = None # Force agent re-init with new service-tier config
if save_config_value("agent.service_tier", saved_value):
_cprint(f" {_GOLD}Codex inference tier set to {label} (saved to config){_RST}")
_cprint(f" {_GOLD}Priority Processing set to {label} (saved to config){_RST}")
else:
_cprint(f" {_GOLD}Codex inference tier set to {label} (session only){_RST}")
_cprint(f" {_GOLD}Priority Processing set to {label} (session only){_RST}")
def _on_reasoning(self, reasoning_text: str):
"""Callback for intermediate reasoning display during tool-call loops."""