From a5bc698b9a771e28921a5f10cbcad1f18a4e02b9 Mon Sep 17 00:00:00 2001 From: Al Sayed Hoota <78100282+AlsayedHoota@users.noreply.github.com> Date: Mon, 13 Apr 2026 04:52:03 -0700 Subject: [PATCH] fix(session_search): improve truncation to center on actual query matches Three-tier match strategy for _truncate_around_matches(): 1. Full-phrase search (exact query string positions) 2. Proximity co-occurrence (all terms within 200 chars) 3. Individual terms (fallback, preserves existing behavior) Sliding window picks the start offset covering the most matches. Moved inline import re to module level. Co-authored-by: Al Sayed Hoota <78100282+AlsayedHoota@users.noreply.github.com> --- tools/session_search_tool.py | 84 ++++++++++++++++++++++++++++-------- 1 file changed, 67 insertions(+), 17 deletions(-) diff --git a/tools/session_search_tool.py b/tools/session_search_tool.py index 3e9c68af40..9be73a04a3 100644 --- a/tools/session_search_tool.py +++ b/tools/session_search_tool.py @@ -19,6 +19,7 @@ import asyncio import concurrent.futures import json import logging +import re from typing import Dict, Any, List, Optional, Union from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning @@ -90,31 +91,80 @@ def _truncate_around_matches( full_text: str, query: str, max_chars: int = MAX_SESSION_CHARS ) -> str: """ - Truncate a conversation transcript to max_chars, centered around - where the query terms appear. Keeps content near matches, trims the edges. + Truncate a conversation transcript to *max_chars*, choosing a window + that maximises coverage of positions where the *query* actually appears. + + Strategy (in priority order): + 1. Try to find the full query as a phrase (case-insensitive). + 2. If no phrase hit, look for positions where all query terms appear + within a 200-char proximity window (co-occurrence). + 3. Fall back to individual term positions. + + Once candidate positions are collected the function picks the window + start that covers the most of them. """ if len(full_text) <= max_chars: return full_text - # Find the first occurrence of any query term - query_terms = query.lower().split() text_lower = full_text.lower() - first_match = len(full_text) - for term in query_terms: - pos = text_lower.find(term) - if pos != -1 and pos < first_match: - first_match = pos + query_lower = query.lower().strip() + match_positions: list[int] = [] - if first_match == len(full_text): - # No match found, take from the start - first_match = 0 + # --- 1. Full-phrase search ------------------------------------------------ + phrase_pat = re.compile(re.escape(query_lower)) + match_positions = [m.start() for m in phrase_pat.finditer(text_lower)] - # Center the window around the first match - half = max_chars // 2 - start = max(0, first_match - half) + # --- 2. Proximity co-occurrence of all terms (within 200 chars) ----------- + if not match_positions: + terms = query_lower.split() + if len(terms) > 1: + # Collect every occurrence of each term + term_positions: dict[str, list[int]] = {} + for t in terms: + term_positions[t] = [ + m.start() for m in re.finditer(re.escape(t), text_lower) + ] + # Slide through positions of the rarest term and check proximity + rarest = min(terms, key=lambda t: len(term_positions.get(t, []))) + for pos in term_positions.get(rarest, []): + if all( + any(abs(p - pos) < 200 for p in term_positions.get(t, [])) + for t in terms + if t != rarest + ): + match_positions.append(pos) + + # --- 3. Individual term positions (last resort) --------------------------- + if not match_positions: + terms = query_lower.split() + for t in terms: + for m in re.finditer(re.escape(t), text_lower): + match_positions.append(m.start()) + + if not match_positions: + # Nothing at all — take from the start + truncated = full_text[:max_chars] + suffix = "\n\n...[later conversation truncated]..." if max_chars < len(full_text) else "" + return truncated + suffix + + # --- Pick window that covers the most match positions --------------------- + match_positions.sort() + + best_start = 0 + best_count = 0 + for candidate in match_positions: + ws = max(0, candidate - max_chars // 4) # bias: 25% before, 75% after + we = ws + max_chars + if we > len(full_text): + ws = max(0, len(full_text) - max_chars) + we = len(full_text) + count = sum(1 for p in match_positions if ws <= p < we) + if count > best_count: + best_count = count + best_start = ws + + start = best_start end = min(len(full_text), start + max_chars) - if end - start < max_chars: - start = max(0, end - max_chars) truncated = full_text[start:end] prefix = "...[earlier conversation truncated]...\n\n" if start > 0 else ""