mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-09 08:21:50 +00:00
feat(compression): raise compaction trigger to 85% for gpt-5.5 on Codex OAuth (#40957)
The ChatGPT Codex OAuth backend hard-caps gpt-5.5 at a 272K context window (verified live: a ~330K-token request to chatgpt.com/backend-api/codex/responses is rejected with context_length_exceeded while ~250K succeeds; the same slug exposes 1.05M on the direct OpenAI API / OpenRouter and 400K on Copilot). At the default 50% trigger, auto-compaction fires at ~136K — half the usable window. Raise the trigger to 85% (~231K) on this exact route only, gated by a new compression.codex_gpt55_autoraise config flag (default true). When it fires, emit a one-time notice (CLI inline print + gateway status_callback replay) with the exact opt-back-out command. gpt-5.5 on any other provider keeps the user's global threshold. - _is_codex_gpt55() matches the 5.5 family only on provider=openai-codex - _compression_threshold_for_model() now provider-aware + opt-out param - config key + _config_version bump (27->28) for backfill - docs + tests (40 cases in test_arcee_trinity_overrides.py)
This commit is contained in:
parent
2d099fed1e
commit
0524c9b34e
5 changed files with 218 additions and 4 deletions
|
|
@ -68,6 +68,24 @@ def _ra():
|
|||
return run_agent
|
||||
|
||||
|
||||
def _build_codex_gpt55_autoraise_notice(autoraise: Dict[str, float]) -> str:
|
||||
"""Build the one-time notice shown when Codex gpt-5.5 raises compaction.
|
||||
|
||||
``autoraise`` is ``{"from": <old_ratio>, "to": <new_ratio>}``. The same
|
||||
text is printed inline for CLI users and replayed via ``status_callback``
|
||||
for gateway users, so it must be self-contained and include the exact
|
||||
opt-back-out command.
|
||||
"""
|
||||
from_pct = int(round(autoraise["from"] * 100))
|
||||
to_pct = int(round(autoraise["to"] * 100))
|
||||
return (
|
||||
f"ℹ Codex gpt-5.5 caps context at 272K, so auto-compaction was raised "
|
||||
f"to {to_pct}% (from {from_pct}%) to use more of the window before "
|
||||
f"summarizing.\n"
|
||||
f" Opt back out: hermes config set compression.codex_gpt55_autoraise false"
|
||||
)
|
||||
|
||||
|
||||
def _normalized_custom_base_url(value: Any) -> str:
|
||||
if not isinstance(value, str):
|
||||
return ""
|
||||
|
|
@ -1240,11 +1258,41 @@ def init_agent(
|
|||
if not isinstance(_compression_cfg, dict):
|
||||
_compression_cfg = {}
|
||||
compression_threshold = float(_compression_cfg.get("threshold", 0.50))
|
||||
# Per-model/route compaction-threshold override. Codex gpt-5.5 raises to
|
||||
# 85% (the Codex backend caps the window at 272K, so the default 50% would
|
||||
# compact at ~136K — half the usable context). Gated by an opt-out config
|
||||
# flag so the user can fall back to the global threshold; when the override
|
||||
# fires we stash a one-time notification (replayed on the first turn) that
|
||||
# tells the user what changed and how to revert.
|
||||
_codex_gpt55_autoraise = str(
|
||||
_compression_cfg.get("codex_gpt55_autoraise", True)
|
||||
).lower() in {"true", "1", "yes"}
|
||||
agent._compression_threshold_autoraised = None
|
||||
try:
|
||||
from agent.auxiliary_client import _compression_threshold_for_model as _cthresh_fn
|
||||
_model_cthresh = _cthresh_fn(agent.model)
|
||||
from agent.auxiliary_client import (
|
||||
_compression_threshold_for_model as _cthresh_fn,
|
||||
_is_codex_gpt55 as _is_codex_gpt55_fn,
|
||||
)
|
||||
_model_cthresh = _cthresh_fn(
|
||||
agent.model,
|
||||
agent.provider,
|
||||
allow_codex_gpt55_autoraise=_codex_gpt55_autoraise,
|
||||
)
|
||||
if _model_cthresh is not None:
|
||||
_prev_threshold = compression_threshold
|
||||
compression_threshold = _model_cthresh
|
||||
# Notify only for the Codex gpt-5.5 autoraise (the Arcee Trinity
|
||||
# override is a long-standing silent default). Skip the notice when
|
||||
# the user's global threshold already meets/exceeds the raised
|
||||
# value, since nothing actually changed for them.
|
||||
if (
|
||||
_is_codex_gpt55_fn(agent.model, agent.provider)
|
||||
and _model_cthresh > _prev_threshold + 1e-9
|
||||
):
|
||||
agent._compression_threshold_autoraised = {
|
||||
"from": _prev_threshold,
|
||||
"to": _model_cthresh,
|
||||
}
|
||||
except Exception:
|
||||
pass
|
||||
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in {"true", "1", "yes"}
|
||||
|
|
@ -1621,11 +1669,24 @@ def init_agent(
|
|||
print(f"📊 Context limit: {agent.context_compressor.context_length:,} tokens (compress at {int(compression_threshold*100)}% = {agent.context_compressor.threshold_tokens:,})")
|
||||
else:
|
||||
print(f"📊 Context limit: {agent.context_compressor.context_length:,} tokens (auto-compression disabled)")
|
||||
# One-time notice when the Codex gpt-5.5 autoraise kicked in, with the
|
||||
# exact opt-back-out command. Printed inline at startup for CLI users;
|
||||
# gateway users get the same text replayed via _compression_warning on
|
||||
# turn 1 (set below, after the warning slot is initialized).
|
||||
_autoraise = getattr(agent, "_compression_threshold_autoraised", None)
|
||||
if _autoraise and compression_enabled:
|
||||
print(_build_codex_gpt55_autoraise_notice(_autoraise))
|
||||
|
||||
# Check immediately so CLI users see the warning at startup.
|
||||
# Gateway status_callback is not yet wired, so any warning is stored
|
||||
# in _compression_warning and replayed in the first run_conversation().
|
||||
agent._compression_warning = None
|
||||
# Gateway parity for the Codex gpt-5.5 autoraise notice: the startup print
|
||||
# above only reaches the CLI, so stash the same text here to be replayed
|
||||
# through status_callback on the first turn (Telegram/Discord/Slack/etc.).
|
||||
_autoraise = getattr(agent, "_compression_threshold_autoraised", None)
|
||||
if _autoraise and compression_enabled:
|
||||
agent._compression_warning = _build_codex_gpt55_autoraise_notice(_autoraise)
|
||||
# Lazy feasibility check: deferred to the first turn that approaches the
|
||||
# compression threshold. Running it eagerly here costs ~400ms cold (network
|
||||
# probe of the auxiliary provider chain + /models lookup) on every agent
|
||||
|
|
|
|||
|
|
@ -202,6 +202,35 @@ def _is_arcee_trinity_thinking(model: Optional[str]) -> bool:
|
|||
return bare == "trinity-large-thinking"
|
||||
|
||||
|
||||
# Context window enforced by ChatGPT's Codex OAuth backend for gpt-5.5.
|
||||
# The raw OpenAI API and OpenRouter expose 1.05M for the same slug, but the
|
||||
# Codex backend hard-caps at 272K (verified live: a ~330K-token request to
|
||||
# chatgpt.com/backend-api/codex/responses is rejected with
|
||||
# ``context_length_exceeded`` while ~250K succeeds). With a 272K ceiling the
|
||||
# default 50% compaction trigger fires at ~136K — wasteful, since the model
|
||||
# can hold far more raw context before summarization actually buys anything.
|
||||
# We raise the trigger to 85% (~231K) on this exact route so Codex gpt-5.5
|
||||
# sessions use the window they actually have.
|
||||
_CODEX_GPT55_COMPACTION_THRESHOLD = 0.85
|
||||
|
||||
|
||||
def _is_codex_gpt55(model: Optional[str], provider: Optional[str] = None) -> bool:
|
||||
"""True for gpt-5.5 accessed through the ChatGPT Codex OAuth backend.
|
||||
|
||||
Matches only the Codex OAuth route (provider ``openai-codex``), not the
|
||||
direct OpenAI API, OpenRouter, or GitHub Copilot paths — those expose a
|
||||
larger context window for the same slug and must keep the user's default
|
||||
compaction threshold. ``gpt-5.5-pro`` and dated snapshots
|
||||
(``gpt-5.5-2026-04-23``) are matched via prefix so the override tracks the
|
||||
family without re-listing every variant.
|
||||
"""
|
||||
prov = (provider or "").strip().lower()
|
||||
if prov != "openai-codex":
|
||||
return False
|
||||
bare = (model or "").strip().lower().rsplit("/", 1)[-1]
|
||||
return bare == "gpt-5.5" or bare.startswith("gpt-5.5-") or bare.startswith("gpt-5.5.")
|
||||
|
||||
|
||||
def _fixed_temperature_for_model(
|
||||
model: Optional[str],
|
||||
base_url: Optional[str] = None,
|
||||
|
|
@ -224,18 +253,32 @@ def _fixed_temperature_for_model(
|
|||
return None
|
||||
|
||||
|
||||
def _compression_threshold_for_model(model: Optional[str]) -> Optional[float]:
|
||||
def _compression_threshold_for_model(
    model: Optional[str],
    provider: Optional[str] = None,
    *,
    allow_codex_gpt55_autoraise: bool = True,
) -> Optional[float]:
    """Look up a per-model/route override for the compression threshold.

    The threshold is the fraction of the context window that must be used
    before summarization is triggered; a higher value postpones compression
    and keeps more raw context around.

    Overrides, checked in this order:
    - Arcee Trinity Large Thinking → 0.75 (preserve reasoning context).
      Not affected by ``allow_codex_gpt55_autoraise``.
    - gpt-5.5 on the Codex OAuth route → ``_CODEX_GPT55_COMPACTION_THRESHOLD``,
      because Codex caps the window at 272K and the default 50% trigger
      would compact at ~136K. Applied only while
      ``allow_codex_gpt55_autoraise`` is true (the caller forwards the
      config flag), so the user can opt back down to the global default.

    Returns the override as a float in (0, 1], or ``None`` when the global
    ``compression.threshold`` config value should be left untouched.
    """
    if _is_arcee_trinity_thinking(model):
        return 0.75

    # Short-circuit keeps the route check from running when the user opted out.
    codex_autoraise_applies = allow_codex_gpt55_autoraise and _is_codex_gpt55(
        model, provider
    )
    return _CODEX_GPT55_COMPACTION_THRESHOLD if codex_autoraise_applies else None
|
||||
|
||||
# Default auxiliary models for direct API-key providers (cheap/fast for side tasks)
|
||||
|
|
|
|||
|
|
@ -1139,6 +1139,16 @@ DEFAULT_CONFIG = {
|
|||
# Default False matches historical behavior; set to
|
||||
# True if you'd rather pause than silently lose
|
||||
# context turns when your aux model is flaky.
|
||||
"codex_gpt55_autoraise": True, # When True, gpt-5.5 on the ChatGPT Codex OAuth
|
||||
# route raises its compaction trigger to 85% (vs the
|
||||
# global `threshold` above). Codex hard-caps gpt-5.5
|
||||
# at a 272K window, so the default 50% would compact
|
||||
# at ~136K and waste half the usable context. Set to
|
||||
# False to opt back down to the global threshold
|
||||
# (e.g. 0.50) for Codex gpt-5.5 sessions. Only this
|
||||
# exact route is affected — gpt-5.5 on OpenAI's
|
||||
# direct API, OpenRouter, and Copilot keep the
|
||||
# global threshold regardless.
|
||||
},
|
||||
|
||||
# Anthropic prompt caching (Claude via OpenRouter or native Anthropic API).
|
||||
|
|
@ -2420,7 +2430,7 @@ DEFAULT_CONFIG = {
|
|||
|
||||
|
||||
# Config schema version - bump this when adding new required fields
|
||||
"_config_version": 27,
|
||||
"_config_version": 28,
|
||||
}
|
||||
|
||||
# =============================================================================
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ from agent.auxiliary_client import (
|
|||
_compression_threshold_for_model,
|
||||
_fixed_temperature_for_model,
|
||||
_is_arcee_trinity_thinking,
|
||||
_is_codex_gpt55,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -74,3 +75,85 @@ def test_compression_threshold_default_none_for_other_models() -> None:
|
|||
assert _compression_threshold_for_model("trinity-large-preview") is None
|
||||
assert _compression_threshold_for_model("claude-sonnet-4.6") is None
|
||||
assert _compression_threshold_for_model("kimi-k2") is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Codex gpt-5.5 compaction-threshold autoraise
|
||||
#
|
||||
# ChatGPT's Codex OAuth backend caps gpt-5.5 at a 272K window (verified live:
|
||||
# ~330K-token request rejected with context_length_exceeded, ~250K accepted).
|
||||
# The default 50% compaction trigger would fire at ~136K — half the usable
|
||||
# window — so this route raises the trigger to 85%. Only the Codex OAuth route
|
||||
# is affected; the same slug on OpenAI direct / OpenRouter / Copilot exposes a
|
||||
# larger window and keeps the user's global threshold.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    [
        "gpt-5.5",
        "gpt-5.5-pro",
        "gpt-5.5-2026-04-23",  # dated snapshot
        "gpt-5.5-codex-mini",  # Codex variant of the 5.5 family (also 272K-capped)
        "openai/gpt-5.5",  # aggregator-prefixed (still on the codex route)
        "GPT-5.5",  # case-insensitive
        " gpt-5.5 ",  # whitespace tolerant
    ],
)
def test_is_codex_gpt55_matches_on_codex_provider(model: str) -> None:
    """Every spelling of the 5.5 family is recognised on the Codex route."""
    matched = _is_codex_gpt55(model, "openai-codex")
    assert matched is True
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "provider",
    ["openrouter", "openai", "copilot", "openai-api", "", None],
)
def test_is_codex_gpt55_rejects_non_codex_providers(provider) -> None:
    """gpt-5.5 off the Codex route keeps the larger window, so no match."""
    matched = _is_codex_gpt55("gpt-5.5", provider)
    assert matched is False
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "model",
    ["gpt-5.4", "gpt-5", "gpt-5.55", "gpt-5.50", "", None],
)
def test_is_codex_gpt55_rejects_non_55_models(model) -> None:
    """Slugs outside the 5.5 family never match, even on the Codex route.

    gpt-5.55 / gpt-5.50 are separate families: the "gpt-5.5-" / "gpt-5.5."
    prefix guards demand a separator right after "5.5".
    """
    matched = _is_codex_gpt55(model, "openai-codex")
    assert matched is False
|
||||
|
||||
|
||||
def test_compression_threshold_for_codex_gpt55() -> None:
    """The Codex route raises the threshold to 0.85 for 5.5-family slugs."""
    for slug in ("gpt-5.5", "gpt-5.5-pro", "openai/gpt-5.5"):
        assert _compression_threshold_for_model(slug, "openai-codex") == 0.85
|
||||
|
||||
|
||||
def test_compression_threshold_codex_gpt55_other_routes_unaffected() -> None:
    """Same slug on a different route yields no override (config value kept)."""
    for route in ("openrouter", "openai", "copilot"):
        assert _compression_threshold_for_model("gpt-5.5", route) is None
    # Omitting the provider altogether must not trigger the override either.
    assert _compression_threshold_for_model("openai/gpt-5.5") is None
|
||||
|
||||
|
||||
def test_compression_threshold_codex_gpt55_opt_out() -> None:
    """With the autoraise disabled, Codex gpt-5.5 falls back to None."""
    override = _compression_threshold_for_model(
        "gpt-5.5", "openai-codex", allow_codex_gpt55_autoraise=False
    )
    assert override is None
|
||||
|
||||
|
||||
def test_compression_threshold_opt_out_does_not_disable_trinity() -> None:
    """The opt-out flag only gates the Codex gpt-5.5 autoraise.

    The Arcee Trinity override must keep applying while the flag is False.
    """
    override = _compression_threshold_for_model(
        "trinity-large-thinking", "openrouter", allow_codex_gpt55_autoraise=False
    )
    assert override == 0.75
|
||||
|
|
|
|||
|
|
@ -84,6 +84,7 @@ compression:
|
|||
threshold: 0.50 # Fraction of context window (default: 0.50 = 50%)
|
||||
target_ratio: 0.20 # How much of threshold to keep as tail (default: 0.20)
|
||||
protect_last_n: 20 # Minimum protected tail messages (default: 20)
|
||||
codex_gpt55_autoraise: true # gpt-5.5 on Codex OAuth: raise trigger to 85% (default: true)
|
||||
|
||||
# Summarization model/provider configured under auxiliary:
|
||||
auxiliary:
|
||||
|
|
@ -101,6 +102,22 @@ auxiliary:
|
|||
| `target_ratio` | `0.20` | 0.10-0.80 | Controls tail protection token budget: `threshold_tokens × target_ratio` |
|
||||
| `protect_last_n` | `20` | ≥1 | Minimum number of recent messages always preserved |
|
||||
| `protect_first_n` | `3` | (hardcoded) | System prompt + first exchange always preserved |
|
||||
| `codex_gpt55_autoraise` | `true` | bool | Raise the trigger to 85% for gpt-5.5 on the ChatGPT Codex OAuth route (see below). Set `false` to keep the global `threshold` |
|
||||
|
||||
### Codex gpt-5.5 threshold autoraise
|
||||
|
||||
The ChatGPT Codex OAuth backend hard-caps gpt-5.5 at a **272K** context window
|
||||
(the same slug exposes 1.05M on OpenAI's direct API and OpenRouter, and 400K on
|
||||
GitHub Copilot). At the default 50% trigger, compaction would fire at ~136K —
|
||||
half the window the model can actually use. When the active route is Codex
|
||||
OAuth (`provider: openai-codex`) and the model is gpt-5.5, Hermes raises the
|
||||
trigger to **85%** (~231K) and prints a one-time notice with the opt-out
|
||||
command. Only this exact route is affected; gpt-5.5 on any other provider keeps
|
||||
your global `threshold`. To opt back down to the global value:
|
||||
|
||||
```bash
|
||||
hermes config set compression.codex_gpt55_autoraise false
|
||||
```
|
||||
|
||||
### Computed Values (for a 200K context model at defaults)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue