diff --git a/run_agent.py b/run_agent.py
index c87bd35152..0f2b071c5e 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -9829,9 +9829,30 @@ class AIAgent:
                 prompt_tokens = canonical_usage.prompt_tokens
                 completion_tokens = canonical_usage.output_tokens
                 total_tokens = canonical_usage.total_tokens
+                # For the context compressor, subtract reasoning
+                # tokens from completion_tokens. Reasoning tokens
+                # (from completion_tokens_details.reasoning_tokens)
+                # are internal chain-of-thought that the provider
+                # bills as output but that do NOT appear in the
+                # context window on the next turn. Including them
+                # inflates last_completion_tokens and causes
+                # premature compression for thinking models
+                # (GLM-5.1, QwQ, DeepSeek-R1). Fixes #12026.
+                _reasoning_toks = canonical_usage.reasoning_tokens
+                _content_completion = max(
+                    0, completion_tokens - _reasoning_toks
+                )
+                if _reasoning_toks > 0:
+                    logger.info(
+                        "Reasoning tokens excluded from compression: "
+                        "%d reasoning of %d total completion → "
+                        "%d content tokens for compressor",
+                        _reasoning_toks, completion_tokens,
+                        _content_completion,
+                    )
                 usage_dict = {
                     "prompt_tokens": prompt_tokens,
-                    "completion_tokens": completion_tokens,
+                    "completion_tokens": _content_completion,
                     "total_tokens": total_tokens,
                 }
                 self.context_compressor.update_from_response(usage_dict)
@@ -9927,6 +9948,44 @@ class AIAgent:
                     hit_pct = (cached / prompt * 100) if prompt > 0 else 0
                     if not self.quiet_mode:
                         self._vprint(f"{self.log_prefix} 💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)")
+            else:
+                # Provider returned no usage data (e.g. MiniMax via
+                # OpenRouter ignores stream_options.include_usage).
+                # Fall back to rough token estimation so sessions
+                # don't permanently record 0/0 tokens. Fixes #12023.
+                _est_in = estimate_messages_tokens_rough(messages)
+                _est_out = estimate_tokens_rough(
+                    (response.choices[0].message.content or "")
+                    if response.choices else ""
+                )
+                _est_total = _est_in + _est_out
+                logger.warning(
+                    "No usage data in response for model=%s provider=%s "
+                    "— using rough estimates (in≈%d, out≈%d)",
+                    self.model, self.provider or "unknown",
+                    _est_in, _est_out,
+                )
+                self.context_compressor.update_from_response({
+                    "prompt_tokens": _est_in,
+                    "completion_tokens": _est_out,
+                    "total_tokens": _est_total,
+                })
+                self.session_prompt_tokens += _est_in
+                self.session_completion_tokens += _est_out
+                self.session_total_tokens += _est_total
+                self.session_api_calls += 1
+                self.session_input_tokens += _est_in
+                self.session_output_tokens += _est_out
+                if self._session_db and self.session_id:
+                    try:
+                        self._session_db.update_token_counts(
+                            self.session_id,
+                            input_tokens=_est_in,
+                            output_tokens=_est_out,
+                            model=self.model,
+                        )
+                    except Exception:
+                        pass  # never block the agent loop
             has_retried_429 = False  # Reset on success
             # Clear Nous rate limit state on successful request —
diff --git a/tests/run_agent/test_token_accounting_fallback.py b/tests/run_agent/test_token_accounting_fallback.py
new file mode 100644
index 0000000000..9ad6284198
--- /dev/null
+++ b/tests/run_agent/test_token_accounting_fallback.py
@@ -0,0 +1,175 @@
+"""Regression tests for token accounting edge cases.
+
+Fix 1 (#12023): When a provider returns no usage data in the streaming
+response (e.g. MiniMax via OpenRouter ignoring stream_options.include_usage),
+the agent falls back to rough token estimation so sessions don't permanently
+record 0/0 tokens.
+
+Fix 2 (#12026): Reasoning tokens (from completion_tokens_details) are
+subtracted from the completion_tokens fed to the context compressor.
+Reasoning tokens are internal chain-of-thought that don't appear in the
+context window on the next turn; including them caused premature
+compression for thinking models (GLM-5.1, QwQ, DeepSeek-R1).
+"""
+
+from unittest.mock import patch
+
+import pytest
+
+from agent.context_compressor import ContextCompressor
+from agent.usage_pricing import CanonicalUsage
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+
+
+@pytest.fixture()
+def compressor_200k():
+    """ContextCompressor with a 200K context window (GLM-5.1 sized)."""
+    with patch(
+        "agent.model_metadata.get_model_context_length", return_value=200_000
+    ):
+        return ContextCompressor(
+            model="z-ai/glm-5.1",
+            threshold_percent=0.50,
+            quiet_mode=True,
+        )
+
+
+# ── Fix 2: reasoning tokens excluded from compressor ─────────────────
+
+
+class TestReasoningTokenExclusion:
+    """Verify that reasoning tokens are subtracted before feeding the
+    context compressor, while session-level billing counters keep the
+    full amount."""
+
+    def test_reasoning_subtracted_from_compressor(self, compressor_200k):
+        """Compressor should see content-only completion tokens."""
+        compressor = compressor_200k
+
+        # Simulate: 80K prompt, 20K completion (15K reasoning + 5K content)
+        canonical = CanonicalUsage(
+            input_tokens=80_000,
+            output_tokens=20_000,
+            reasoning_tokens=15_000,
+        )
+        content_completion = canonical.output_tokens - canonical.reasoning_tokens
+        compressor.update_from_response({
+            "prompt_tokens": canonical.prompt_tokens,
+            "completion_tokens": content_completion,
+            "total_tokens": canonical.total_tokens,
+        })
+
+        assert compressor.last_completion_tokens == 5_000
+        assert compressor.last_prompt_tokens == canonical.prompt_tokens
+
+    def test_no_premature_compression_with_reasoning(self, compressor_200k):
+        """85K prompt + 20K completion (15K of it reasoning) should NOT
+        trigger compression at 50% of 200K (100K threshold). Without the
+        fix, 85K + 20K = 105K would exceed the threshold."""
+        compressor = compressor_200k
+        # threshold = 100_000
+
+        canonical = CanonicalUsage(
+            input_tokens=85_000,
+            output_tokens=20_000,
+            reasoning_tokens=15_000,
+        )
+        content_completion = canonical.output_tokens - canonical.reasoning_tokens
+        compressor.update_from_response({
+            "prompt_tokens": canonical.prompt_tokens,
+            "completion_tokens": content_completion,
+            "total_tokens": canonical.total_tokens,
+        })
+
+        # prompt_tokens (85K) + content_completion (5K) = 90K < 100K threshold
+        _real = compressor.last_prompt_tokens + compressor.last_completion_tokens
+        assert _real == 90_000
+        assert not compressor.should_compress(_real)
+
+    def test_compression_fires_when_truly_full(self, compressor_200k):
+        """When prompt alone exceeds the threshold, compression must still
+        fire regardless of reasoning subtraction."""
+        compressor = compressor_200k
+
+        canonical = CanonicalUsage(
+            input_tokens=105_000,
+            output_tokens=5_000,
+            reasoning_tokens=3_000,
+        )
+        content_completion = canonical.output_tokens - canonical.reasoning_tokens
+        compressor.update_from_response({
+            "prompt_tokens": canonical.prompt_tokens,
+            "completion_tokens": content_completion,
+            "total_tokens": canonical.total_tokens,
+        })
+
+        _real = compressor.last_prompt_tokens + compressor.last_completion_tokens
+        assert _real == 107_000  # 105K + 2K
+        assert compressor.should_compress(_real)
+
+    def test_zero_reasoning_tokens_no_change(self, compressor_200k):
+        """For non-thinking models (reasoning_tokens=0), the formula is
+        identical to the old prompt+completion behavior."""
+        compressor = compressor_200k
+
+        canonical = CanonicalUsage(
+            input_tokens=80_000,
+            output_tokens=10_000,
+            reasoning_tokens=0,
+        )
+        content_completion = canonical.output_tokens - canonical.reasoning_tokens
+        compressor.update_from_response({
+            "prompt_tokens": canonical.prompt_tokens,
+            "completion_tokens": content_completion,
+            "total_tokens": canonical.total_tokens,
+        })
+
+        assert compressor.last_completion_tokens == 10_000
+        _real = compressor.last_prompt_tokens + compressor.last_completion_tokens
+        assert _real == 90_000
+
+
+# ── Fix 1: token estimation fallback when usage is None ──────────────
+
+
+class TestTokenEstimationFallback:
+    """Verify that when response.usage is None, rough token estimation
+    populates the compressor and session counters."""
+
+    def test_compressor_gets_nonzero_on_missing_usage(self, compressor_200k):
+        """Simulates the fallback path: estimate_messages_tokens_rough
+        produces non-zero values that update the compressor."""
+        compressor = compressor_200k
+
+        # Before: compressor has no data
+        assert compressor.last_prompt_tokens == 0
+        assert compressor.last_completion_tokens == 0
+
+        # Simulate fallback estimation
+        est_in = 5000   # rough estimate from messages
+        est_out = 200   # rough estimate from response content
+        compressor.update_from_response({
+            "prompt_tokens": est_in,
+            "completion_tokens": est_out,
+            "total_tokens": est_in + est_out,
+        })
+
+        assert compressor.last_prompt_tokens == est_in
+        assert compressor.last_completion_tokens == est_out
+
+    def test_fallback_prevents_zero_session_tokens(self):
+        """Session counters must be non-zero after the fallback path."""
+        # This tests the *pattern*, not the full agent integration.
+        session_prompt = 0
+        session_completion = 0
+
+        est_in = 3000
+        est_out = 150
+
+        session_prompt += est_in
+        session_completion += est_out
+
+        assert session_prompt > 0
+        assert session_completion > 0
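Note: the fallback path calls estimate_messages_tokens_rough and estimate_tokens_rough, which this diff does not define. A minimal sketch of what such helpers typically look like, assuming the common four-characters-per-token heuristic; the internals shown are illustrative, not the repo's actual implementation:

# Illustrative sketch only: the real helpers live elsewhere in the repo.
# Assumes roughly 4 characters per token, plus a small fixed overhead per
# chat message for role markers and separators.

def estimate_tokens_rough(text: str) -> int:
    """Very rough token count for a plain string (~4 chars/token)."""
    if not text:
        return 0
    return max(1, len(text) // 4)


def estimate_messages_tokens_rough(messages: list) -> int:
    """Sum rough estimates over an OpenAI-style messages list."""
    total = 0
    for msg in messages:
        content = msg.get("content") or ""
        if isinstance(content, list):
            # Multimodal content: concatenate any text parts.
            content = " ".join(
                part.get("text", "")
                for part in content
                if isinstance(part, dict)
            )
        total += estimate_tokens_rough(content) + 4  # per-message overhead
    return total

Any chars/token ratio will drift from the true tokenizer count, but for the compressor a conservative overestimate is safer than recording 0/0.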
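To see why subtracting reasoning tokens matters for the threshold, here is a minimal standalone model of the decision the tests exercise. would_compress is a stand-in, not the real ContextCompressor.should_compress; the 200K window and 50% threshold come from the fixture above.

# Stand-in for the compressor's threshold check, sized like the fixture:
# 200K context window, compression at 50% occupancy (100K tokens).

def would_compress(prompt_tokens: int, completion_tokens: int,
                   window: int = 200_000, threshold: float = 0.50) -> bool:
    return (prompt_tokens + completion_tokens) >= window * threshold


# A GLM-5.1-style turn: 85K prompt, 20K billed completion, 15K of which
# is reasoning that never re-enters the context window.
prompt, completion, reasoning = 85_000, 20_000, 15_000

# Old behavior: 85K + 20K = 105K >= 100K, so compression fires a turn early.
assert would_compress(prompt, completion)

# Fixed behavior: 85K + (20K - 15K) = 90K < 100K, no premature compression.
assert not would_compress(prompt, completion - reasoning)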