fix(session_search): improve truncation to center on actual query matches

Three-tier match strategy for _truncate_around_matches():
1. Full-phrase search (exact query string positions)
2. Proximity co-occurrence (all terms within 200 chars)
3. Individual terms (fallback, preserves existing behavior)

Sliding window picks the start offset covering the most matches.

Moved inline import re to module level.

Co-authored-by: Al Sayed Hoota <78100282+AlsayedHoota@users.noreply.github.com>
This commit is contained in:
Al Sayed Hoota 2026-04-13 04:52:03 -07:00 committed by Teknium
parent dbed40f39b
commit a5bc698b9a

View file

@ -19,6 +19,7 @@ import asyncio
import concurrent.futures
import json
import logging
import re
from typing import Dict, Any, List, Optional, Union
from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning
@ -90,31 +91,80 @@ def _truncate_around_matches(
full_text: str, query: str, max_chars: int = MAX_SESSION_CHARS
) -> str:
"""
Truncate a conversation transcript to max_chars, centered around
where the query terms appear. Keeps content near matches, trims the edges.
Truncate a conversation transcript to *max_chars*, choosing a window
that maximises coverage of positions where the *query* actually appears.
Strategy (in priority order):
1. Try to find the full query as a phrase (case-insensitive).
2. If no phrase hit, look for positions where all query terms appear
within a 200-char proximity window (co-occurrence).
3. Fall back to individual term positions.
Once candidate positions are collected the function picks the window
start that covers the most of them.
"""
if len(full_text) <= max_chars:
return full_text
# Find the first occurrence of any query term
query_terms = query.lower().split()
text_lower = full_text.lower()
first_match = len(full_text)
for term in query_terms:
pos = text_lower.find(term)
if pos != -1 and pos < first_match:
first_match = pos
query_lower = query.lower().strip()
match_positions: list[int] = []
if first_match == len(full_text):
# No match found, take from the start
first_match = 0
# --- 1. Full-phrase search ------------------------------------------------
phrase_pat = re.compile(re.escape(query_lower))
match_positions = [m.start() for m in phrase_pat.finditer(text_lower)]
# Center the window around the first match
half = max_chars // 2
start = max(0, first_match - half)
# --- 2. Proximity co-occurrence of all terms (within 200 chars) -----------
if not match_positions:
terms = query_lower.split()
if len(terms) > 1:
# Collect every occurrence of each term
term_positions: dict[str, list[int]] = {}
for t in terms:
term_positions[t] = [
m.start() for m in re.finditer(re.escape(t), text_lower)
]
# Slide through positions of the rarest term and check proximity
rarest = min(terms, key=lambda t: len(term_positions.get(t, [])))
for pos in term_positions.get(rarest, []):
if all(
any(abs(p - pos) < 200 for p in term_positions.get(t, []))
for t in terms
if t != rarest
):
match_positions.append(pos)
# --- 3. Individual term positions (last resort) ---------------------------
if not match_positions:
terms = query_lower.split()
for t in terms:
for m in re.finditer(re.escape(t), text_lower):
match_positions.append(m.start())
if not match_positions:
# Nothing at all — take from the start
truncated = full_text[:max_chars]
suffix = "\n\n...[later conversation truncated]..." if max_chars < len(full_text) else ""
return truncated + suffix
# --- Pick window that covers the most match positions ---------------------
match_positions.sort()
best_start = 0
best_count = 0
for candidate in match_positions:
ws = max(0, candidate - max_chars // 4) # bias: 25% before, 75% after
we = ws + max_chars
if we > len(full_text):
ws = max(0, len(full_text) - max_chars)
we = len(full_text)
count = sum(1 for p in match_positions if ws <= p < we)
if count > best_count:
best_count = count
best_start = ws
start = best_start
end = min(len(full_text), start + max_chars)
if end - start < max_chars:
start = max(0, end - max_chars)
truncated = full_text[start:end]
prefix = "...[earlier conversation truncated]...\n\n" if start > 0 else ""