fix(session_search): improve truncation to center on actual query matches

Three-tier match strategy for _truncate_around_matches(): (1) full-phrase search (exact query string positions); (2) proximity co-occurrence (all terms within 200 chars); (3) individual terms (fallback, preserves existing behavior). A sliding window then picks the start offset covering the most matches. Moved inline import re to module level. Co-authored-by: Al Sayed Hoota <78100282+AlsayedHoota@users.noreply.github.com>
2026-06-17 09:41:58 +00:00 · 2026-04-13 04:52:03 -07:00 · 2026-04-13 04:52:03 -07:00 · a5bc698b9a
commit a5bc698b9a
parent dbed40f39b
1 changed files with 67 additions and 17 deletions
--- a/tools/session_search_tool.py
+++ b/tools/session_search_tool.py
@ -19,6 +19,7 @@ import asyncio
 import concurrent.futures
 import json
 import logging
+import re
 from typing import Dict, Any, List, Optional, Union

 from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning
@ -90,31 +91,80 @@ def _truncate_around_matches(
    full_text: str, query: str, max_chars: int = MAX_SESSION_CHARS
 ) -> str:
    """
-    Truncate a conversation transcript to max_chars, centered around
-    where the query terms appear. Keeps content near matches, trims the edges.
+    Truncate a conversation transcript to *max_chars*, choosing a window
+    that maximises coverage of positions where the *query* actually appears.
+
+    Strategy (in priority order):
+    1. Try to find the full query as a phrase (case-insensitive).
+    2. If no phrase hit, look for positions where all query terms appear
+       within a 200-char proximity window (co-occurrence).
+    3. Fall back to individual term positions.
+
+    Once candidate positions are collected the function picks the window
+    start that covers the most of them.
    """
    if len(full_text) <= max_chars:
        return full_text

-    # Find the first occurrence of any query term
-    query_terms = query.lower().split()
    text_lower = full_text.lower()
-    first_match = len(full_text)
-    for term in query_terms:
-        pos = text_lower.find(term)
-        if pos != -1 and pos < first_match:
-            first_match = pos
+    query_lower = query.lower().strip()
+    match_positions: list[int] = []

-    if first_match == len(full_text):
-        # No match found, take from the start
-        first_match = 0
+    # --- 1. Full-phrase search ------------------------------------------------
+    phrase_pat = re.compile(re.escape(query_lower))
+    match_positions = [m.start() for m in phrase_pat.finditer(text_lower)]

-    # Center the window around the first match
-    half = max_chars // 2
-    start = max(0, first_match - half)
+    # --- 2. Proximity co-occurrence of all terms (within 200 chars) -----------
+    if not match_positions:
+        terms = query_lower.split()
+        if len(terms) > 1:
+            # Collect every occurrence of each term
+            term_positions: dict[str, list[int]] = {}
+            for t in terms:
+                term_positions[t] = [
+                    m.start() for m in re.finditer(re.escape(t), text_lower)
+                ]
+            # Slide through positions of the rarest term and check proximity
+            rarest = min(terms, key=lambda t: len(term_positions.get(t, [])))
+            for pos in term_positions.get(rarest, []):
+                if all(
+                    any(abs(p - pos) < 200 for p in term_positions.get(t, []))
+                    for t in terms
+                    if t != rarest
+                ):
+                    match_positions.append(pos)
+
+    # --- 3. Individual term positions (last resort) ---------------------------
+    if not match_positions:
+        terms = query_lower.split()
+        for t in terms:
+            for m in re.finditer(re.escape(t), text_lower):
+                match_positions.append(m.start())
+
+    if not match_positions:
+        # Nothing at all — take from the start
+        truncated = full_text[:max_chars]
+        suffix = "\n\n...[later conversation truncated]..." if max_chars < len(full_text) else ""
+        return truncated + suffix
+
+    # --- Pick window that covers the most match positions ---------------------
+    match_positions.sort()
+
+    best_start = 0
+    best_count = 0
+    for candidate in match_positions:
+        ws = max(0, candidate - max_chars // 4)  # bias: 25% before, 75% after
+        we = ws + max_chars
+        if we > len(full_text):
+            ws = max(0, len(full_text) - max_chars)
+            we = len(full_text)
+        count = sum(1 for p in match_positions if ws <= p < we)
+        if count > best_count:
+            best_count = count
+            best_start = ws
+
+    start = best_start
    end = min(len(full_text), start + max_chars)
-    if end - start < max_chars:
-        start = max(0, end - max_chars)

    truncated = full_text[start:end]
    prefix = "...[earlier conversation truncated]...\n\n" if start > 0 else ""