refactor(context_compressor): improve summary generation logic and error handling

Updated the _generate_summary method to attempt summary generation using the auxiliary model first, with a fallback to the main model. If both attempts fail, the method now returns None instead of a placeholder, allowing the caller to handle missing summaries appropriately. This change enhances the robustness of context compression and improves logging for failure scenarios.
This commit is contained in:
teknium1 2026-03-07 11:54:51 -08:00
parent 5baae0df88
commit 306d92a9d7

View file

@ -7,7 +7,7 @@ protecting head and tail context.
import logging import logging
import os import os
from typing import Any, Dict, List from typing import Any, Dict, List, Optional
from agent.auxiliary_client import get_text_auxiliary_client from agent.auxiliary_client import get_text_auxiliary_client
from agent.model_metadata import ( from agent.model_metadata import (
@ -82,11 +82,14 @@ class ContextCompressor:
"compression_count": self.compression_count, "compression_count": self.compression_count,
} }
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> str: def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
"""Generate a concise summary of conversation turns using a fast model.""" """Generate a concise summary of conversation turns.
if not self.client:
return "[CONTEXT SUMMARY]: Previous conversation turns have been compressed to save space. The assistant performed various actions and received responses."
Tries the auxiliary model first, then falls back to the user's main
model. Returns None if all attempts fail — the caller should drop
the middle turns without a summary rather than inject a useless
placeholder.
"""
parts = [] parts = []
for msg in turns_to_summarize: for msg in turns_to_summarize:
role = msg.get("role", "unknown") role = msg.get("role", "unknown")
@ -117,28 +120,28 @@ TURNS TO SUMMARIZE:
Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
try: # 1. Try the auxiliary model (cheap/fast)
return self._call_summary_model(self.client, self.summary_model, prompt) if self.client:
except Exception as e: try:
logging.warning(f"Failed to generate context summary with auxiliary model: {e}") return self._call_summary_model(self.client, self.summary_model, prompt)
except Exception as e:
logging.warning(f"Failed to generate context summary with auxiliary model: {e}")
# Fallback: try the main model's endpoint. This handles the common # 2. Fallback: try the user's main model endpoint
# case where the user switched providers (e.g. OpenRouter → local LLM) fallback_client, fallback_model = self._get_fallback_client()
# but a stale API key causes the auxiliary client to pick the old if fallback_client is not None:
# provider which then fails (402, auth error, etc.). try:
fallback_client, fallback_model = self._get_fallback_client() logger.info("Retrying context summary with main model (%s)", fallback_model)
if fallback_client is not None: summary = self._call_summary_model(fallback_client, fallback_model, prompt)
try: self.client = fallback_client
logger.info("Retrying context summary with fallback client (%s)", fallback_model) self.summary_model = fallback_model
summary = self._call_summary_model(fallback_client, fallback_model, prompt) return summary
# Success — swap in the working client for future compressions except Exception as fallback_err:
self.client = fallback_client logging.warning(f"Main model summary also failed: {fallback_err}")
self.summary_model = fallback_model
return summary
except Exception as fallback_err:
logging.warning(f"Fallback summary model also failed: {fallback_err}")
return "[CONTEXT SUMMARY]: Previous conversation turns have been compressed. The assistant performed tool calls and received responses." # 3. All models failed — return None so the caller drops turns without a summary
logging.warning("Context compression: no model available for summary. Middle turns will be dropped without summary.")
return None
def _call_summary_model(self, client, model: str, prompt: str) -> str: def _call_summary_model(self, client, model: str, prompt: str) -> str:
"""Make the actual LLM call to generate a summary. Raises on failure.""" """Make the actual LLM call to generate a summary. Raises on failure."""
@ -326,25 +329,6 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)") print(f"\n📦 Context compression triggered ({display_tokens:,} tokens ≥ {self.threshold_tokens:,} threshold)")
print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})") print(f" 📊 Model context limit: {self.context_length:,} tokens ({self.threshold_percent*100:.0f}% = {self.threshold_tokens:,})")
# Truncation fallback when no auxiliary model is available
if self.client is None:
print("⚠️ Context compression: no auxiliary model available. Falling back to message truncation.")
# Keep system message(s) at the front and the protected tail;
# simply drop the oldest non-system messages until under threshold.
kept = []
for msg in messages:
if msg.get("role") == "system":
kept.append(msg.copy())
else:
break
tail = messages[-self.protect_last_n:]
kept.extend(m.copy() for m in tail)
self.compression_count += 1
kept = self._sanitize_tool_pairs(kept)
if not self.quiet_mode:
print(f" ✂️ Truncated: {len(messages)}{len(kept)} messages (dropped middle turns)")
return kept
if not self.quiet_mode: if not self.quiet_mode:
print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)") print(f" 🗜️ Summarizing turns {compress_start+1}-{compress_end} ({len(turns_to_summarize)} turns)")
@ -357,7 +341,11 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
msg["content"] = (msg.get("content") or "") + "\n\n[Note: Some earlier conversation turns may be summarized to preserve context space.]" msg["content"] = (msg.get("content") or "") + "\n\n[Note: Some earlier conversation turns may be summarized to preserve context space.]"
compressed.append(msg) compressed.append(msg)
compressed.append({"role": "user", "content": summary}) if summary:
compressed.append({"role": "user", "content": summary})
else:
if not self.quiet_mode:
print(" ⚠️ No summary model available — middle turns dropped without summary")
for i in range(compress_end, n_messages): for i in range(compress_end, n_messages):
compressed.append(messages[i].copy()) compressed.append(messages[i].copy())