feat(compression): raise compaction trigger to 85% for gpt-5.5 on Codex OAuth (#40957)

The ChatGPT Codex OAuth backend hard-caps gpt-5.5 at a 272K context window (verified live: a ~330K-token request to chatgpt.com/backend-api/codex/responses is rejected with context_length_exceeded while ~250K succeeds; the same slug exposes 1.05M on the direct OpenAI API / OpenRouter and 400K on Copilot). At the default 50% trigger, auto-compaction fires at ~136K — half the usable window. Raise the trigger to 85% (~231K) on this exact route only, gated by a new compression.codex_gpt55_autoraise config flag (default true). When it fires, emit a one-time notice (CLI inline print + gateway status_callback replay) with the exact opt-back-out command. gpt-5.5 on any other provider keeps the user's global threshold.

- _is_codex_gpt55() matches the 5.5 family only on provider=openai-codex
- _compression_threshold_for_model() now provider-aware + opt-out param
- config key + _config_version bump (27->28) for backfill
- docs + tests (40 cases in test_arcee_trinity_overrides.py)
2026-06-09 08:21:50 +00:00 · 2026-06-07 01:40:50 -07:00 · 2026-06-07 01:40:50 -07:00 · 0524c9b34e
commit 0524c9b34e
parent 2d099fed1e
5 changed files with 218 additions and 4 deletions
--- a/agent/agent_init.py
+++ b/agent/agent_init.py
@ -68,6 +68,24 @@ def _ra():
    return run_agent


+def _build_codex_gpt55_autoraise_notice(autoraise: Dict[str, float]) -> str:
+    """Render the one-time Codex gpt-5.5 compaction-autoraise notice.
+
+    ``autoraise`` maps ``"from"`` / ``"to"`` to the previous and raised
+    threshold ratios. CLI users see this text printed inline and gateway
+    users get it replayed through ``status_callback``, so it has to stand on
+    its own and carry the exact opt-back-out command.
+    """
+    old_pct, new_pct = (int(round(autoraise[k] * 100)) for k in ("from", "to"))
+    headline = (
+        "ℹ Codex gpt-5.5 caps context at 272K, so auto-compaction was raised "
+        f"to {new_pct}% (from {old_pct}%) to use more of the window before "
+        "summarizing."
+    )
+    opt_out = "  Opt back out: hermes config set compression.codex_gpt55_autoraise false"
+    return f"{headline}\n{opt_out}"
+
+
 def _normalized_custom_base_url(value: Any) -> str:
    if not isinstance(value, str):
        return ""
@ -1240,11 +1258,41 @@ def init_agent(
    if not isinstance(_compression_cfg, dict):
        _compression_cfg = {}
    compression_threshold = float(_compression_cfg.get("threshold", 0.50))
+    # Per-model/route compaction-threshold override. Codex gpt-5.5 raises to
+    # 85% (the Codex backend caps the window at 272K, so the default 50% would
+    # compact at ~136K — half the usable context). Gated by an opt-out config
+    # flag so the user can fall back to the global threshold; when the override
+    # fires we stash a one-time notification (replayed on the first turn) that
+    # tells the user what changed and how to revert.
+    _codex_gpt55_autoraise = str(
+        _compression_cfg.get("codex_gpt55_autoraise", True)
+    ).lower() in {"true", "1", "yes"}
+    agent._compression_threshold_autoraised = None
    try:
-        from agent.auxiliary_client import _compression_threshold_for_model as _cthresh_fn
-        _model_cthresh = _cthresh_fn(agent.model)
+        from agent.auxiliary_client import (
+            _compression_threshold_for_model as _cthresh_fn,
+            _is_codex_gpt55 as _is_codex_gpt55_fn,
+        )
+        _model_cthresh = _cthresh_fn(
+            agent.model,
+            agent.provider,
+            allow_codex_gpt55_autoraise=_codex_gpt55_autoraise,
+        )
        if _model_cthresh is not None:
+            _prev_threshold = compression_threshold
            compression_threshold = _model_cthresh
+            # The Arcee Trinity override is a long-standing silent default and
+            # always applies. The Codex gpt-5.5 autoraise may only raise: when
+            # the user's global threshold already meets/exceeds the raised
+            # value we restore it (never lower it) and skip the notice.
+            if _is_codex_gpt55_fn(agent.model, agent.provider):
+                if _model_cthresh > _prev_threshold + 1e-9:
+                    agent._compression_threshold_autoraised = {
+                        "from": _prev_threshold,
+                        "to": _model_cthresh,
+                    }
+                else:
+                    compression_threshold = _prev_threshold
    except Exception:
        pass
    compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in {"true", "1", "yes"}
@ -1621,11 +1669,24 @@ def init_agent(
            print(f"📊 Context limit: {agent.context_compressor.context_length:,} tokens (compress at {int(compression_threshold*100)}% = {agent.context_compressor.threshold_tokens:,})")
        else:
            print(f"📊 Context limit: {agent.context_compressor.context_length:,} tokens (auto-compression disabled)")
+        # One-time notice when the Codex gpt-5.5 autoraise kicked in, with the
+        # exact opt-back-out command. Printed inline at startup for CLI users;
+        # gateway users get the same text replayed via _compression_warning on
+        # turn 1 (set below, after the warning slot is initialized).
+        _autoraise = getattr(agent, "_compression_threshold_autoraised", None)
+        if _autoraise and compression_enabled:
+            print(_build_codex_gpt55_autoraise_notice(_autoraise))

    # Check immediately so CLI users see the warning at startup.
    # Gateway status_callback is not yet wired, so any warning is stored
    # in _compression_warning and replayed in the first run_conversation().
    agent._compression_warning = None
+    # Gateway parity for the Codex gpt-5.5 autoraise notice: the startup print
+    # above only reaches the CLI, so stash the same text here to be replayed
+    # through status_callback on the first turn (Telegram/Discord/Slack/etc.).
+    _autoraise = getattr(agent, "_compression_threshold_autoraised", None)
+    if _autoraise and compression_enabled:
+        agent._compression_warning = _build_codex_gpt55_autoraise_notice(_autoraise)
    # Lazy feasibility check: deferred to the first turn that approaches the
    # compression threshold. Running it eagerly here costs ~400ms cold (network
    # probe of the auxiliary provider chain + /models lookup) on every agent
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -202,6 +202,35 @@ def _is_arcee_trinity_thinking(model: Optional[str]) -> bool:
    return bare == "trinity-large-thinking"


+# Context window enforced by ChatGPT's Codex OAuth backend for gpt-5.5.
+# The raw OpenAI API and OpenRouter expose 1.05M for the same slug, but the
+# Codex backend hard-caps at 272K (verified live: a ~330K-token request to
+# chatgpt.com/backend-api/codex/responses is rejected with
+# ``context_length_exceeded`` while ~250K succeeds). With a 272K ceiling the
+# default 50% compaction trigger fires at ~136K — wasteful, since the model
+# can hold far more raw context before summarization actually buys anything.
+# We raise the trigger to 85% (~231K) on this exact route so Codex gpt-5.5
+# sessions use the window they actually have.
+_CODEX_GPT55_COMPACTION_THRESHOLD = 0.85
+
+
+def _is_codex_gpt55(model: Optional[str], provider: Optional[str] = None) -> bool:
+    """Report whether ``model`` is a gpt-5.5 slug on the Codex OAuth route.
+
+    Only provider ``openai-codex`` qualifies; the direct OpenAI API,
+    OpenRouter, and GitHub Copilot paths expose a larger context window for
+    the same slug and must keep the user's default compaction threshold.
+    Matching is case- and whitespace-insensitive and ignores any ``vendor/``
+    prefix. The bare slug, ``gpt-5.5-*`` (``gpt-5.5-pro``, dated snapshots
+    such as ``gpt-5.5-2026-04-23``) and ``gpt-5.5.*`` all count as the family.
+    """
+    if (provider or "").strip().lower() != "openai-codex":
+        return False
+    # Keep only the final path segment so "openai/gpt-5.5" matches too.
+    slug = (model or "").strip().lower().rpartition("/")[2]
+    return slug == "gpt-5.5" or slug.startswith(("gpt-5.5-", "gpt-5.5."))
+
+
 def _fixed_temperature_for_model(
    model: Optional[str],
    base_url: Optional[str] = None,
@ -224,18 +253,32 @@ def _fixed_temperature_for_model(
    return None


-def _compression_threshold_for_model(model: Optional[str]) -> Optional[float]:
+def _compression_threshold_for_model(
+    model: Optional[str],
+    provider: Optional[str] = None,
+    *,
+    allow_codex_gpt55_autoraise: bool = True,
+) -> Optional[float]:
    """Return a context-compression threshold override for specific models.

    The threshold is the fraction of the model's context window that must be
    consumed before Hermes triggers summarization.  Higher values delay
    compression and preserve more raw context.

+    Per-model/route overrides, checked in this order:
+      - Arcee Trinity Large Thinking → 0.75 (preserve reasoning context);
+        not affected by ``allow_codex_gpt55_autoraise``.
+      - gpt-5.5 on the Codex OAuth route → 0.85 (Codex caps the window at
+        272K, so the default 50% trigger would compact at ~136K). Skipped when
+        ``allow_codex_gpt55_autoraise`` is False (the caller's opt-out flag).
+
    Returns a float in (0, 1] to override the global ``compression.threshold``
    config value, or ``None`` to leave the user's config value unchanged.
    """
    if _is_arcee_trinity_thinking(model):
        return 0.75
+    if allow_codex_gpt55_autoraise and _is_codex_gpt55(model, provider):
+        return _CODEX_GPT55_COMPACTION_THRESHOLD
    return None

 # Default auxiliary models for direct API-key providers (cheap/fast for side tasks)
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@ -1139,6 +1139,16 @@ DEFAULT_CONFIG = {
                                      # Default False matches historical behavior; set to
                                      # True if you'd rather pause than silently lose
                                      # context turns when your aux model is flaky.
+        "codex_gpt55_autoraise": True,  # When True, gpt-5.5 on the ChatGPT Codex OAuth
+                                      # route raises its compaction trigger to 85% (vs the
+                                      # global `threshold` above). Codex hard-caps gpt-5.5
+                                      # at a 272K window, so the default 50% would compact
+                                      # at ~136K and waste half the usable context. Set to
+                                      # False to opt back down to the global threshold
+                                      # (e.g. 0.50) for Codex gpt-5.5 sessions. Only this
+                                      # exact route is affected — gpt-5.5 on OpenAI's
+                                      # direct API, OpenRouter, and Copilot keep the
+                                      # global threshold regardless.
    },

    # Anthropic prompt caching (Claude via OpenRouter or native Anthropic API).
@ -2420,7 +2430,7 @@ DEFAULT_CONFIG = {


    # Config schema version - bump this when adding new required fields
-    "_config_version": 27,
+    "_config_version": 28,
 }

 # =============================================================================
--- a/tests/agent/test_arcee_trinity_overrides.py
+++ b/tests/agent/test_arcee_trinity_overrides.py
@ -17,6 +17,7 @@ from agent.auxiliary_client import (
    _compression_threshold_for_model,
    _fixed_temperature_for_model,
    _is_arcee_trinity_thinking,
+    _is_codex_gpt55,
 )


@ -74,3 +75,85 @@ def test_compression_threshold_default_none_for_other_models() -> None:
    assert _compression_threshold_for_model("trinity-large-preview") is None
    assert _compression_threshold_for_model("claude-sonnet-4.6") is None
    assert _compression_threshold_for_model("kimi-k2") is None
+
+
+# ---------------------------------------------------------------------------
+# Codex gpt-5.5 compaction-threshold autoraise
+#
+# ChatGPT's Codex OAuth backend caps gpt-5.5 at a 272K window (verified live:
+# ~330K-token request rejected with context_length_exceeded, ~250K accepted).
+# The default 50% compaction trigger would fire at ~136K — half the usable
+# window — so this route raises the trigger to 85%. Only the Codex OAuth route
+# is affected; the same slug on OpenAI direct / OpenRouter / Copilot exposes a
+# larger window and keeps the user's global threshold.
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "gpt-5.5",
+        "gpt-5.5-pro",
+        "gpt-5.5-2026-04-23",  # dated snapshot
+        "gpt-5.5-codex-mini",  # Codex variant of the 5.5 family (also 272K-capped)
+        "openai/gpt-5.5",  # aggregator-prefixed (still on the codex route)
+        "GPT-5.5",  # case-insensitive
+        "  gpt-5.5  ",  # whitespace tolerant
+    ],
+)
+def test_is_codex_gpt55_matches_on_codex_provider(model: str) -> None:
+    assert _is_codex_gpt55(model, "openai-codex") is True
+
+
+@pytest.mark.parametrize(
+    "provider",
+    ["openrouter", "openai", "copilot", "openai-api", "", None],
+)
+def test_is_codex_gpt55_rejects_non_codex_providers(provider) -> None:
+    # gpt-5.5 on any non-Codex route keeps the larger window — no override.
+    assert _is_codex_gpt55("gpt-5.5", provider) is False
+
+
+@pytest.mark.parametrize(
+    "model",
+    ["gpt-5.4", "gpt-5", "gpt-5.55", "gpt-5.50", "", None],
+)
+def test_is_codex_gpt55_rejects_non_55_models(model) -> None:
+    # gpt-5.55 / gpt-5.50 are different families and must NOT match — the
+    # "gpt-5.5-" / "gpt-5.5." prefix guards require a separator after "5.5".
+    assert _is_codex_gpt55(model, "openai-codex") is False
+
+
+def test_compression_threshold_for_codex_gpt55() -> None:
+    assert _compression_threshold_for_model("gpt-5.5", "openai-codex") == 0.85
+    assert _compression_threshold_for_model("gpt-5.5-pro", "openai-codex") == 0.85
+    assert _compression_threshold_for_model("openai/gpt-5.5", "openai-codex") == 0.85
+
+
+def test_compression_threshold_codex_gpt55_other_routes_unaffected() -> None:
+    # Same slug, different route → no override (keep the user's config value).
+    assert _compression_threshold_for_model("gpt-5.5", "openrouter") is None
+    assert _compression_threshold_for_model("gpt-5.5", "openai") is None
+    assert _compression_threshold_for_model("gpt-5.5", "copilot") is None
+    assert _compression_threshold_for_model("openai/gpt-5.5") is None  # no provider
+
+
+def test_compression_threshold_codex_gpt55_opt_out() -> None:
+    # allow_codex_gpt55_autoraise=False reverts to the global default (None).
+    assert (
+        _compression_threshold_for_model(
+            "gpt-5.5", "openai-codex", allow_codex_gpt55_autoraise=False
+        )
+        is None
+    )
+
+
+def test_compression_threshold_opt_out_does_not_disable_trinity() -> None:
+    # The opt-out flag is scoped to the Codex gpt-5.5 autoraise; the Arcee
+    # Trinity override must still apply when the flag is False.
+    assert (
+        _compression_threshold_for_model(
+            "trinity-large-thinking", "openrouter", allow_codex_gpt55_autoraise=False
+        )
+        == 0.75
+    )
--- a/website/docs/developer-guide/context-compression-and-caching.md
+++ b/website/docs/developer-guide/context-compression-and-caching.md
@ -84,6 +84,7 @@ compression:
  threshold: 0.50            # Fraction of context window (default: 0.50 = 50%)
  target_ratio: 0.20         # How much of threshold to keep as tail (default: 0.20)
  protect_last_n: 20         # Minimum protected tail messages (default: 20)
+  codex_gpt55_autoraise: true  # gpt-5.5 on Codex OAuth: raise trigger to 85% (default: true)

 # Summarization model/provider configured under auxiliary:
 auxiliary:
@ -101,6 +102,22 @@ auxiliary:
 | `target_ratio` | `0.20` | 0.10-0.80 | Controls tail protection token budget: `threshold_tokens × target_ratio` |
 | `protect_last_n` | `20` | ≥1 | Minimum number of recent messages always preserved |
 | `protect_first_n` | `3` | (hardcoded) | System prompt + first exchange always preserved |
+| `codex_gpt55_autoraise` | `true` | bool | Raise the trigger to 85% for gpt-5.5 on the ChatGPT Codex OAuth route (see below). Set `false` to keep the global `threshold` |
+
+### Codex gpt-5.5 threshold autoraise
+
+The ChatGPT Codex OAuth backend hard-caps gpt-5.5 at a **272K** context window
+(the same slug exposes 1.05M on OpenAI's direct API and OpenRouter, and 400K on
+GitHub Copilot). At the default 50% trigger, compaction would fire at ~136K —
+half the window the model can actually use. When the active route is Codex
+OAuth (`provider: openai-codex`) and the model is gpt-5.5, Hermes raises the
+trigger to **85%** (~231K) and prints a one-time notice with the opt-out
+command. Only this exact route is affected; gpt-5.5 on any other provider keeps
+your global `threshold`. To opt back down to the global value:
+
+```bash
+hermes config set compression.codex_gpt55_autoraise false
+```

 ### Computed Values (for a 200K context model at defaults)