diff --git a/run_agent.py b/run_agent.py
index c87bd35152..0f2b071c5e 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -9829,9 +9829,30 @@ class AIAgent:
                 prompt_tokens = canonical_usage.prompt_tokens
                 completion_tokens = canonical_usage.output_tokens
                 total_tokens = canonical_usage.total_tokens
+                # For the context compressor, subtract reasoning
+                # tokens from completion_tokens. Reasoning tokens
+                # (from completion_tokens_details.reasoning_tokens)
+                # are internal chain-of-thought that the provider
+                # bills as output but that do NOT appear in the
+                # context window on the next turn. Including them
+                # inflates last_completion_tokens and causes
+                # premature compression for thinking models
+                # (GLM-5.1, QwQ, DeepSeek-R1). Fixes #12026.
+                _reasoning_toks = canonical_usage.reasoning_tokens
+                _content_completion = max(
+                    0, completion_tokens - _reasoning_toks
+                )
+                if _reasoning_toks > 0:
+                    logger.info(
+                        "Reasoning tokens excluded from compression: "
+                        "%d reasoning of %d total completion → "
+                        "%d content tokens for compressor",
+                        _reasoning_toks, completion_tokens,
+                        _content_completion,
+                    )
                 usage_dict = {
                     "prompt_tokens": prompt_tokens,
-                    "completion_tokens": completion_tokens,
+                    "completion_tokens": _content_completion,
                     "total_tokens": total_tokens,
                 }
                 self.context_compressor.update_from_response(usage_dict)
@@ -9927,6 +9948,44 @@ class AIAgent:
                     hit_pct = (cached / prompt * 100) if prompt > 0 else 0
                     if not self.quiet_mode:
                         self._vprint(f"{self.log_prefix} 💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)")
+            else:
+                # Provider returned no usage data (e.g. MiniMax via
+                # OpenRouter ignores stream_options.include_usage).
+                # Fall back to rough token estimation so sessions
+                # don't permanently record 0/0 tokens. Fixes #12023.
+                _est_in = estimate_messages_tokens_rough(messages)
+                _est_out = estimate_tokens_rough(
+                    (response.choices[0].message.content or "")
+                    if response.choices else ""
+                )
+                _est_total = _est_in + _est_out
+                logger.warning(
+                    "No usage data in response for model=%s provider=%s "
+                    "— using rough estimates (in≈%d, out≈%d)",
+                    self.model, self.provider or "unknown",
+                    _est_in, _est_out,
+                )
+                self.context_compressor.update_from_response({
+                    "prompt_tokens": _est_in,
+                    "completion_tokens": _est_out,
+                    "total_tokens": _est_total,
+                })
+                self.session_prompt_tokens += _est_in
+                self.session_completion_tokens += _est_out
+                self.session_total_tokens += _est_total
+                self.session_api_calls += 1
+                self.session_input_tokens += _est_in
+                self.session_output_tokens += _est_out
+                if self._session_db and self.session_id:
+                    try:
+                        self._session_db.update_token_counts(
+                            self.session_id,
+                            input_tokens=_est_in,
+                            output_tokens=_est_out,
+                            model=self.model,
+                        )
+                    except Exception:
+                        pass  # never block the agent loop
             has_retried_429 = False  # Reset on success
             # Clear Nous rate limit state on successful request —
diff --git a/tests/run_agent/test_token_accounting_fallback.py b/tests/run_agent/test_token_accounting_fallback.py
new file mode 100644
index 0000000000..9ad6284198
--- /dev/null
+++ b/tests/run_agent/test_token_accounting_fallback.py
@@ -0,0 +1,175 @@
+"""Regression tests for token accounting edge cases.
+
+Fix 1 (#12023): When a provider returns no usage data in the streaming
+response (e.g. MiniMax via OpenRouter ignoring stream_options.include_usage),
+the agent falls back to rough token estimation so sessions don't permanently
+record 0/0 tokens.
+
+Fix 2 (#12026): Reasoning tokens (from completion_tokens_details) are
+subtracted from the completion_tokens fed to the context compressor.
+Reasoning tokens are internal chain-of-thought that don't appear in the
+context window on the next turn; including them caused premature
+compression for thinking models (GLM-5.1, QwQ, DeepSeek-R1).
+"""
+
+from unittest.mock import patch
+
+import pytest
+
+from agent.context_compressor import ContextCompressor
+from agent.usage_pricing import CanonicalUsage
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+
+
+@pytest.fixture()
+def compressor_200k():
+    """ContextCompressor with a 200K context window (GLM-5.1 sized)."""
+    with patch(
+        "agent.model_metadata.get_model_context_length", return_value=200_000
+    ):
+        return ContextCompressor(
+            model="z-ai/glm-5.1",
+            threshold_percent=0.50,
+            quiet_mode=True,
+        )
+
+
+# ── Fix 2: reasoning tokens excluded from compressor ─────────────────
+
+
+class TestReasoningTokenExclusion:
+    """Verify that reasoning tokens are subtracted before feeding the
+    context compressor, while session-level billing counters keep the
+    full amount."""
+
+    def test_reasoning_subtracted_from_compressor(self, compressor_200k):
+        """Compressor should see content-only completion tokens."""
+        compressor = compressor_200k
+
+        # Simulate: 80K prompt, 20K completion (15K reasoning + 5K content)
+        canonical = CanonicalUsage(
+            input_tokens=80_000,
+            output_tokens=20_000,
+            reasoning_tokens=15_000,
+        )
+        content_completion = canonical.output_tokens - canonical.reasoning_tokens
+        compressor.update_from_response({
+            "prompt_tokens": canonical.prompt_tokens,
+            "completion_tokens": content_completion,
+            "total_tokens": canonical.total_tokens,
+        })
+
+        assert compressor.last_completion_tokens == 5_000
+        assert compressor.last_prompt_tokens == canonical.prompt_tokens
+
+    def test_no_premature_compression_with_reasoning(self, compressor_200k):
+        """85K prompt + 20K completion (15K of it reasoning) should NOT
+        trigger compression at 50% of 200K (100K threshold). Without the
+        fix, 85K + 20K = 105K would exceed the threshold."""
+        compressor = compressor_200k
+        # threshold = 100_000
+
+        canonical = CanonicalUsage(
+            input_tokens=85_000,
+            output_tokens=20_000,
+            reasoning_tokens=15_000,
+        )
+        content_completion = canonical.output_tokens - canonical.reasoning_tokens
+        compressor.update_from_response({
+            "prompt_tokens": canonical.prompt_tokens,
+            "completion_tokens": content_completion,
+            "total_tokens": canonical.total_tokens,
+        })
+
+        # prompt_tokens (85K) + content_completion (5K) = 90K < 100K threshold
+        _real = compressor.last_prompt_tokens + compressor.last_completion_tokens
+        assert _real == 90_000
+        assert not compressor.should_compress(_real)
+
+    def test_compression_fires_when_truly_full(self, compressor_200k):
+        """When prompt alone exceeds the threshold, compression must still
+        fire regardless of reasoning subtraction."""
+        compressor = compressor_200k
+
+        canonical = CanonicalUsage(
+            input_tokens=105_000,
+            output_tokens=5_000,
+            reasoning_tokens=3_000,
+        )
+        content_completion = canonical.output_tokens - canonical.reasoning_tokens
+        compressor.update_from_response({
+            "prompt_tokens": canonical.prompt_tokens,
+            "completion_tokens": content_completion,
+            "total_tokens": canonical.total_tokens,
+        })
+
+        _real = compressor.last_prompt_tokens + compressor.last_completion_tokens
+        assert _real == 107_000  # 105K + 2K
+        assert compressor.should_compress(_real)
+
+    def test_zero_reasoning_tokens_no_change(self, compressor_200k):
+        """For non-thinking models (reasoning_tokens=0), the formula is
+        identical to the old prompt+completion behavior."""
+        compressor = compressor_200k
+
+        canonical = CanonicalUsage(
+            input_tokens=80_000,
+            output_tokens=10_000,
+            reasoning_tokens=0,
+        )
+        content_completion = canonical.output_tokens - canonical.reasoning_tokens
+        compressor.update_from_response({
+            "prompt_tokens": canonical.prompt_tokens,
+            "completion_tokens": content_completion,
+            "total_tokens": canonical.total_tokens,
+        })
+
+        assert compressor.last_completion_tokens == 10_000
+        _real = compressor.last_prompt_tokens + compressor.last_completion_tokens
+        assert _real == 90_000
+
+
+# ── Fix 1: token estimation fallback when usage is None ──────────────
+
+
+class TestTokenEstimationFallback:
+    """Verify that when response.usage is None, rough token estimation
+    populates the compressor and session counters."""
+
+    def test_compressor_gets_nonzero_on_missing_usage(self, compressor_200k):
+        """Simulates the fallback path: estimate_messages_tokens_rough
+        produces non-zero values that update the compressor."""
+        compressor = compressor_200k
+
+        # Before: compressor has no data
+        assert compressor.last_prompt_tokens == 0
+        assert compressor.last_completion_tokens == 0
+
+        # Simulate fallback estimation
+        est_in = 5000   # rough estimate from messages
+        est_out = 200   # rough estimate from response content
+        compressor.update_from_response({
+            "prompt_tokens": est_in,
+            "completion_tokens": est_out,
+            "total_tokens": est_in + est_out,
+        })
+
+        assert compressor.last_prompt_tokens == est_in
+        assert compressor.last_completion_tokens == est_out
+
+    def test_fallback_prevents_zero_session_tokens(self):
+        """Session counters must be non-zero after the fallback path."""
+        # This tests the *pattern*, not the full agent integration.
+        session_prompt = 0
+        session_completion = 0
+
+        est_in = 3000
+        est_out = 150
+
+        session_prompt += est_in
+        session_completion += est_out
+
+        assert session_prompt > 0
+        assert session_completion > 0
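Note: the fallback path calls estimate_messages_tokens_rough and estimate_tokens_rough, which this diff does not define. A minimal sketch of what such helpers typically look like, assuming the common four-characters-per-token heuristic; the internals shown are illustrative, not the repo's actual implementation:

# Illustrative sketch only: the real helpers live elsewhere in the repo.
# Assumes roughly 4 characters per token, plus a small fixed overhead per
# chat message for role markers and separators.

def estimate_tokens_rough(text: str) -> int:
    """Very rough token count for a plain string (~4 chars/token)."""
    if not text:
        return 0
    return max(1, len(text) // 4)


def estimate_messages_tokens_rough(messages: list) -> int:
    """Sum rough estimates over an OpenAI-style messages list."""
    total = 0
    for msg in messages:
        content = msg.get("content") or ""
        if isinstance(content, list):
            # Multimodal content: concatenate any text parts.
            content = " ".join(
                part.get("text", "")
                for part in content
                if isinstance(part, dict)
            )
        total += estimate_tokens_rough(content) + 4  # per-message overhead
    return total

Any chars/token ratio will drift from the true tokenizer count, but for the compressor a conservative overestimate is safer than recording 0/0.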
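To see why subtracting reasoning tokens matters for the threshold, here is a minimal standalone model of the decision the tests exercise. would_compress is a stand-in, not the real ContextCompressor.should_compress; the 200K window and 50% threshold come from the fixture above.

# Stand-in for the compressor's threshold check, sized like the fixture:
# 200K context window, compression at 50% occupancy (100K tokens).

def would_compress(prompt_tokens: int, completion_tokens: int,
                   window: int = 200_000, threshold: float = 0.50) -> bool:
    return (prompt_tokens + completion_tokens) >= window * threshold


# A GLM-5.1-style turn: 85K prompt, 20K billed completion, 15K of which
# is reasoning that never re-enters the context window.
prompt, completion, reasoning = 85_000, 20_000, 15_000

# Old behavior: 85K + 20K = 105K >= 100K, so compression fires a turn early.
assert would_compress(prompt, completion)

# Fixed behavior: 85K + (20K - 15K) = 90K < 100K, no premature compression.
assert not would_compress(prompt, completion - reasoning)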