Port from Kilo-Org/kilocode#9434: strip historical media after compression (#27189)

After context compression, the protected tail messages retain their
original image parts. When those include multi-MB pasted screenshots,
every subsequent API request re-ships the same base-64 blobs forever —
which can push the request past provider body-size limits and wedge the
session even though compression 'succeeded'.

Add _strip_historical_media() to agent/context_compressor.py. After the
summary is built, find the newest user message that carries an image
part and replace image parts in every earlier message with a short
text placeholder ('[Attached image — stripped after compression]').
The newest image-bearing user turn keeps its media so the model can
still analyse what the user just sent.

Handles all three multimodal shapes:
  - OpenAI chat.completions image_url
  - OpenAI Responses API input_image
  - Anthropic native {type: image, source: ...}

Includes 27 unit tests covering the helpers and the end-to-end
compress() integration, plus a manual E2E check confirming a ~4MB
two-image conversation shrinks to ~2MB after compression.
This commit is contained in:
Teknium 2026-05-16 17:18:25 -07:00 committed by GitHub
parent 5cbe0b1c4f
commit 3b39096904
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 382 additions and 0 deletions

View file

@ -221,6 +221,114 @@ def _truncate_tool_call_args_json(args: str, head_chars: int = 200) -> str:
return json.dumps(shrunken, ensure_ascii=False)
_IMAGE_PART_TYPES = frozenset({"image_url", "input_image", "image"})
def _is_image_part(part: Any) -> bool:
"""True if ``part`` is a multimodal image content block.
Recognizes all three shapes the agent handles:
- OpenAI chat.completions: ``{"type": "image_url", "image_url": ...}``
- OpenAI Responses API: ``{"type": "input_image", "image_url": "..."}``
- Anthropic native: ``{"type": "image", "source": {...}}``
"""
if not isinstance(part, dict):
return False
return part.get("type") in _IMAGE_PART_TYPES
def _content_has_images(content: Any) -> bool:
"""True if a message's ``content`` is a multimodal list with image parts."""
if not isinstance(content, list):
return False
return any(_is_image_part(p) for p in content)
def _strip_images_from_content(content: Any) -> Any:
"""Return a copy of ``content`` with every image part replaced by a
short text placeholder.
- String content is returned unchanged.
- Non-list, non-string content is returned unchanged.
- List content: image parts become ``{"type": "text", "text": "[Attached
image stripped after compression]"}``; other parts are preserved as-is.
Input is never mutated.
"""
if not isinstance(content, list):
return content
if not any(_is_image_part(p) for p in content):
return content
new_parts: List[Any] = []
for p in content:
if _is_image_part(p):
new_parts.append({
"type": "text",
"text": "[Attached image — stripped after compression]",
})
else:
new_parts.append(p)
return new_parts
def _strip_historical_media(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Replace image parts in older messages with placeholder text.
The anchor is the *last* user message that has any image content. Every
message before that anchor gets its image parts replaced with a short
placeholder so the outgoing request stops re-shipping the same multi-MB
base-64 image blobs on every turn.
If no user message carries images, the list is returned unchanged.
If the only user message with images is the very first one (nothing
earlier to strip), the list is returned unchanged.
Shallow copies of touched messages only; input is never mutated.
Port of Kilo-Org/kilocode#9434 (adapted for the OpenAI-style message
shape the hermes compressor emits).
"""
if not messages:
return messages
# Find the newest user message that carries at least one image part.
# We anchor on image-bearing user messages (not all user messages) so
# a plain text follow-up after a big-image turn still strips the old
# image — matching the problem kilocode#9434 set out to solve.
anchor = -1
for i in range(len(messages) - 1, -1, -1):
msg = messages[i]
if not isinstance(msg, dict):
continue
if msg.get("role") != "user":
continue
if _content_has_images(msg.get("content")):
anchor = i
break
if anchor <= 0:
# No image-bearing user message, or it's the very first message —
# nothing before it to strip.
return messages
changed = False
result: List[Dict[str, Any]] = []
for i, msg in enumerate(messages):
if i >= anchor or not isinstance(msg, dict):
result.append(msg)
continue
content = msg.get("content")
if not _content_has_images(content):
result.append(msg)
continue
new_msg = msg.copy()
new_msg["content"] = _strip_images_from_content(content)
result.append(new_msg)
changed = True
return result if changed else messages
def _summarize_tool_result(tool_name: str, tool_args: str, tool_content: str) -> str:
"""Create an informative 1-line summary of a tool call + result.
@ -1559,6 +1667,14 @@ The user has requested that this compaction PRIORITISE preserving all informatio
compressed = self._sanitize_tool_pairs(compressed)
# Replace image parts in all compressed messages before the newest
# image-bearing user turn with a short text placeholder. Without
# this, tail messages keep their original multi-MB base-64 image
# payloads forever, which can push every subsequent API request
# past the provider's body-size limit and wedge the session.
# Port of Kilo-Org/kilocode#9434.
compressed = _strip_historical_media(compressed)
new_estimate = estimate_messages_tokens_rough(compressed)
saved_estimate = display_tokens - new_estimate