mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(session_search): improve truncation to center on actual query matches
Three-tier match strategy for _truncate_around_matches(): 1. Full-phrase search (exact query string positions) 2. Proximity co-occurrence (all terms within 200 chars) 3. Individual terms (fallback, preserves existing behavior) Sliding window picks the start offset covering the most matches. Moved inline import re to module level. Co-authored-by: Al Sayed Hoota <78100282+AlsayedHoota@users.noreply.github.com>
This commit is contained in:
parent
dbed40f39b
commit
a5bc698b9a
1 changed file with 67 additions and 17 deletions
|
|
@ -19,6 +19,7 @@ import asyncio
|
|||
import concurrent.futures
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Dict, Any, List, Optional, Union
|
||||
|
||||
from agent.auxiliary_client import async_call_llm, extract_content_or_reasoning
|
||||
|
|
@ -90,31 +91,80 @@ def _truncate_around_matches(
|
|||
full_text: str, query: str, max_chars: int = MAX_SESSION_CHARS
|
||||
) -> str:
|
||||
"""
|
||||
Truncate a conversation transcript to max_chars, centered around
|
||||
where the query terms appear. Keeps content near matches, trims the edges.
|
||||
Truncate a conversation transcript to *max_chars*, choosing a window
|
||||
that maximises coverage of positions where the *query* actually appears.
|
||||
|
||||
Strategy (in priority order):
|
||||
1. Try to find the full query as a phrase (case-insensitive).
|
||||
2. If no phrase hit, look for positions where all query terms appear
|
||||
within a 200-char proximity window (co-occurrence).
|
||||
3. Fall back to individual term positions.
|
||||
|
||||
Once candidate positions are collected the function picks the window
|
||||
start that covers the greatest number of them.
|
||||
"""
|
||||
if len(full_text) <= max_chars:
|
||||
return full_text
|
||||
|
||||
# Find the first occurrence of any query term
|
||||
query_terms = query.lower().split()
|
||||
text_lower = full_text.lower()
|
||||
first_match = len(full_text)
|
||||
for term in query_terms:
|
||||
pos = text_lower.find(term)
|
||||
if pos != -1 and pos < first_match:
|
||||
first_match = pos
|
||||
query_lower = query.lower().strip()
|
||||
match_positions: list[int] = []
|
||||
|
||||
if first_match == len(full_text):
|
||||
# No match found, take from the start
|
||||
first_match = 0
|
||||
# --- 1. Full-phrase search ------------------------------------------------
|
||||
phrase_pat = re.compile(re.escape(query_lower))
|
||||
match_positions = [m.start() for m in phrase_pat.finditer(text_lower)]
|
||||
|
||||
# Center the window around the first match
|
||||
half = max_chars // 2
|
||||
start = max(0, first_match - half)
|
||||
# --- 2. Proximity co-occurrence of all terms (within 200 chars) -----------
|
||||
if not match_positions:
|
||||
terms = query_lower.split()
|
||||
if len(terms) > 1:
|
||||
# Collect every occurrence of each term
|
||||
term_positions: dict[str, list[int]] = {}
|
||||
for t in terms:
|
||||
term_positions[t] = [
|
||||
m.start() for m in re.finditer(re.escape(t), text_lower)
|
||||
]
|
||||
# Slide through positions of the rarest term and check proximity
|
||||
rarest = min(terms, key=lambda t: len(term_positions.get(t, [])))
|
||||
for pos in term_positions.get(rarest, []):
|
||||
if all(
|
||||
any(abs(p - pos) < 200 for p in term_positions.get(t, []))
|
||||
for t in terms
|
||||
if t != rarest
|
||||
):
|
||||
match_positions.append(pos)
|
||||
|
||||
# --- 3. Individual term positions (last resort) ---------------------------
|
||||
if not match_positions:
|
||||
terms = query_lower.split()
|
||||
for t in terms:
|
||||
for m in re.finditer(re.escape(t), text_lower):
|
||||
match_positions.append(m.start())
|
||||
|
||||
if not match_positions:
|
||||
# Nothing at all — take from the start
|
||||
truncated = full_text[:max_chars]
|
||||
suffix = "\n\n...[later conversation truncated]..." if max_chars < len(full_text) else ""
|
||||
return truncated + suffix
|
||||
|
||||
# --- Pick window that covers the most match positions ---------------------
|
||||
match_positions.sort()
|
||||
|
||||
best_start = 0
|
||||
best_count = 0
|
||||
for candidate in match_positions:
|
||||
ws = max(0, candidate - max_chars // 4) # bias: 25% before, 75% after
|
||||
we = ws + max_chars
|
||||
if we > len(full_text):
|
||||
ws = max(0, len(full_text) - max_chars)
|
||||
we = len(full_text)
|
||||
count = sum(1 for p in match_positions if ws <= p < we)
|
||||
if count > best_count:
|
||||
best_count = count
|
||||
best_start = ws
|
||||
|
||||
start = best_start
|
||||
end = min(len(full_text), start + max_chars)
|
||||
if end - start < max_chars:
|
||||
start = max(0, end - max_chars)
|
||||
|
||||
truncated = full_text[start:end]
|
||||
prefix = "...[earlier conversation truncated]...\n\n" if start > 0 else ""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue