diff --git a/tools/session_search_tool.py b/tools/session_search_tool.py index 3e9c68af4..9be73a04a 100644 --- a/tools/session_search_tool.py +++ b/tools/session_search_tool.py @@ -19,6 +19,7 @@ import asyncio import concurrent.futures import json import logging +import re from typing import Dict, Any, List, Optional, Union from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning @@ -90,31 +91,80 @@ def _truncate_around_matches( full_text: str, query: str, max_chars: int = MAX_SESSION_CHARS ) -> str: """ - Truncate a conversation transcript to max_chars, centered around - where the query terms appear. Keeps content near matches, trims the edges. + Truncate a conversation transcript to *max_chars*, choosing a window + that maximises coverage of positions where the *query* actually appears. + + Strategy (in priority order): + 1. Try to find the full query as a phrase (case-insensitive). + 2. If no phrase hit, look for positions where all query terms appear + within a 200-char proximity window (co-occurrence). + 3. Fall back to individual term positions. + + Once candidate positions are collected the function picks the window + start that covers the most of them. """ if len(full_text) <= max_chars: return full_text - # Find the first occurrence of any query term - query_terms = query.lower().split() text_lower = full_text.lower() - first_match = len(full_text) - for term in query_terms: - pos = text_lower.find(term) - if pos != -1 and pos < first_match: - first_match = pos + query_lower = query.lower().strip() + match_positions: list[int] = [] - if first_match == len(full_text): - # No match found, take from the start - first_match = 0 + # --- 1. Full-phrase search ------------------------------------------------ + phrase_pat = re.compile(re.escape(query_lower)) + match_positions = [m.start() for m in phrase_pat.finditer(text_lower)] - # Center the window around the first match - half = max_chars // 2 - start = max(0, first_match - half) + # --- 2. Proximity co-occurrence of all terms (within 200 chars) ----------- + if not match_positions: + terms = query_lower.split() + if len(terms) > 1: + # Collect every occurrence of each term + term_positions: dict[str, list[int]] = {} + for t in terms: + term_positions[t] = [ + m.start() for m in re.finditer(re.escape(t), text_lower) + ] + # Slide through positions of the rarest term and check proximity + rarest = min(terms, key=lambda t: len(term_positions.get(t, []))) + for pos in term_positions.get(rarest, []): + if all( + any(abs(p - pos) < 200 for p in term_positions.get(t, [])) + for t in terms + if t != rarest + ): + match_positions.append(pos) + + # --- 3. Individual term positions (last resort) --------------------------- + if not match_positions: + terms = query_lower.split() + for t in terms: + for m in re.finditer(re.escape(t), text_lower): + match_positions.append(m.start()) + + if not match_positions: + # Nothing at all — take from the start + truncated = full_text[:max_chars] + suffix = "\n\n...[later conversation truncated]..." if max_chars < len(full_text) else "" + return truncated + suffix + + # --- Pick window that covers the most match positions --------------------- + match_positions.sort() + + best_start = 0 + best_count = 0 + for candidate in match_positions: + ws = max(0, candidate - max_chars // 4) # bias: 25% before, 75% after + we = ws + max_chars + if we > len(full_text): + ws = max(0, len(full_text) - max_chars) + we = len(full_text) + count = sum(1 for p in match_positions if ws <= p < we) + if count > best_count: + best_count = count + best_start = ws + + start = best_start end = min(len(full_text), start + max_chars) - if end - start < max_chars: - start = max(0, end - max_chars) truncated = full_text[start:end] prefix = "...[earlier conversation truncated]...\n\n" if start > 0 else ""