Mirror of https://github.com/NousResearch/hermes-agent.git, synced 2026-05-01 01:51:44 +00:00
Adds a term-based inverted index (term_index table, schema v7) that eliminates LLM summarization from the default search path. The fast path returns session metadata and match counts in ~1ms vs 10-15s for the full FTS5+LLM pipeline.

Key changes:
- term_index table: (term, message_id, session_id) WITHOUT ROWID for clustered B-tree lookups. Populated at write time in append_message (best-effort, never blocks inserts). Schema and write-time indexing are sketched below.
- stop_words.py: 179-word NLTK English stop list, no stemming
- term_index.py: extract_terms() for term extraction
- session_search_tool.py: fast=True by default, _fast_search for the term-index path, _full_search preserves the original behavior, CJK queries fall back to the slow path
- Auto-reindex on the v7 migration: _init_schema returns a needs_reindex flag, __init__ calls reindex_term_index() after migration
- Swap strategy for reindex (sketched below): builds into a temp table, then an atomic swap in a single transaction (no empty-index window)
- get_child_session_ids(): public API replacing db._lock/db._conn access in _fast_search
- mode field in search results: 'fast' or 'full'
- Cascade deletes: clear_messages, delete_session, and prune_sessions all clean up term_index entries

Benchmarks on the production DB (47.7 MB, 29,435 messages):
- Term index reindex: 1,152,587 entries from 29,435 messages in 4s
- Fast path: 1-4ms (no LLM)
- Slow path: 10,000-16,000ms (FTS5 + LLM summarization)
- Speedup: 4,000-15,000x on the full round trip

195 tests passing (48 term_index + 149 hermes_state). 12 regression tests from red-team QA cover param binding, child session resolution, cascade deletes, and CJK fallback.
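To make the schema description concrete, here is a minimal sketch of what the term_index table and a fast-path lookup could look like. This is not the SessionDB implementation; the column types, the fast_search helper, and the result shape are assumptions drawn from the summary above.

import sqlite3

# Hypothetical sketch of the term_index schema (schema v7 in the summary).
# WITHOUT ROWID stores rows clustered by the (term, message_id, session_id)
# primary key, so all entries for one term sit together in the B-tree.
TERM_INDEX_DDL = """
CREATE TABLE IF NOT EXISTS term_index (
    term       TEXT NOT NULL,
    message_id INTEGER NOT NULL,
    session_id TEXT NOT NULL,
    PRIMARY KEY (term, message_id, session_id)
) WITHOUT ROWID
"""


def fast_search(conn: sqlite3.Connection, query_terms: list[str], limit: int = 20):
    """Illustrative fast path: per-session match counts, no FTS5, no LLM."""
    if not query_terms:
        return []
    placeholders = ",".join("?" for _ in query_terms)
    rows = conn.execute(
        f"""
        SELECT session_id, COUNT(*) AS match_count
        FROM term_index
        WHERE term IN ({placeholders})
        GROUP BY session_id
        ORDER BY match_count DESC
        LIMIT ?
        """,
        (*query_terms, limit),
    ).fetchall()
    # The real _fast_search also attaches session metadata and resolves
    # child sessions via get_child_session_ids(); omitted here.
    return [{"session_id": sid, "match_count": n, "mode": "fast"} for sid, n in rows]

The design point is that a term lookup reduces to a clustered B-tree range scan plus a GROUP BY, which is consistent with the 1-4ms fast-path numbers reported above.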
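Write-time population in append_message is described as best-effort. A rough sketch of that behavior, with the helper name and error handling invented for illustration:

import sqlite3

from term_index import extract_terms


def _index_message_terms(
    conn: sqlite3.Connection, session_id: str, message_id: int, content: str
) -> None:
    """Best-effort indexing: never allowed to block or fail the message insert."""
    try:
        rows = [(term, message_id, session_id) for term in extract_terms(content)]
        conn.executemany(
            "INSERT OR IGNORE INTO term_index (term, message_id, session_id) "
            "VALUES (?, ?, ?)",
            rows,
        )
    except sqlite3.Error:
        # A missing index entry only degrades search recall for this message;
        # the message row written by append_message is untouched.
        pass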
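The reindex swap strategy could be sketched as follows: build the replacement index in a temporary table, then drop and rename inside one explicit transaction so searchers never see an empty index. The messages-table layout and function signature are again assumptions.

import sqlite3

from term_index import extract_terms


def reindex_term_index(db_path: str) -> None:
    """Rebuild the term index with no empty-index window (illustrative only)."""
    # Autocommit connection so the BEGIN/COMMIT below is the only transaction.
    conn = sqlite3.connect(db_path, isolation_level=None)
    try:
        conn.execute("DROP TABLE IF EXISTS term_index_new")
        conn.execute(
            "CREATE TABLE term_index_new ("
            " term TEXT NOT NULL,"
            " message_id INTEGER NOT NULL,"
            " session_id TEXT NOT NULL,"
            " PRIMARY KEY (term, message_id, session_id)"
            ") WITHOUT ROWID"
        )
        # Populate the replacement index (assumes a messages table with
        # id / session_id / content columns).
        for message_id, session_id, content in conn.execute(
            "SELECT id, session_id, content FROM messages"
        ).fetchall():
            conn.executemany(
                "INSERT OR IGNORE INTO term_index_new VALUES (?, ?, ?)",
                [(t, message_id, session_id) for t in extract_terms(content or "")],
            )
        # Atomic swap: readers see the old index right up to the commit,
        # then the fully built new one. There is never a moment with neither.
        conn.execute("BEGIN IMMEDIATE")
        conn.execute("DROP TABLE IF EXISTS term_index")
        conn.execute("ALTER TABLE term_index_new RENAME TO term_index")
        conn.execute("COMMIT")
    finally:
        conn.close()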
term_index.py · 44 lines · No EOL · 1.4 KiB · Python
"""Term index — inverted index extraction for session search fast path.
|
|
|
|
Extracts non-stop-word terms from message content for insertion into the
|
|
term_index table in SessionDB. Terms are lowercased, punctuation-stripped
|
|
(with preservation of path-like strings), and deduplicated per message.
|
|
"""
|
|
|
|
import re
|
|
from stop_words import is_stop_word
|
|
|
|
# Matches "words" including paths (foo/bar), filenames (file.py), and
|
|
# hyphenated terms (self-hosted). Filters out most punctuation but
|
|
# preserves dots in filenames and slashes in paths.
|
|
# Strategy: split on whitespace first, then strip leading/trailing punctuation.
|
|
_TERM_RE = re.compile(r"[a-zA-Z0-9][\w./\-]*[a-zA-Z0-9]|[a-zA-Z0-9]")
|
|
|
|
|
|
def extract_terms(content: str) -> list[str]:
|
|
"""Extract non-stop-word terms from message content.
|
|
|
|
Returns a deduplicated, lowercased list of terms.
|
|
Stops words, pure punctuation, and empty strings are excluded.
|
|
"""
|
|
if not content:
|
|
return []
|
|
|
|
# Find candidate tokens
|
|
raw_tokens = _TERM_RE.findall(content)
|
|
|
|
seen = set()
|
|
terms = []
|
|
for token in raw_tokens:
|
|
lower = token.lower()
|
|
# Skip stop words
|
|
if is_stop_word(lower):
|
|
continue
|
|
# Skip single characters except meaningful ones
|
|
# (but these are already handled by stop words for 'a', 'I', etc.)
|
|
# Deduplicate within this message
|
|
if lower not in seen:
|
|
seen.add(lower)
|
|
terms.append(lower)
|
|
|
|
return terms |
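For a feel of the output, a quick example of what extract_terms returns for a typical message, assuming the usual NLTK stop words such as 'the', 'in', and 'and' are in stop_words.py:

from term_index import extract_terms

# Paths, filenames, and hyphenated terms survive intact; stop words drop out;
# duplicates within the message appear only once.
terms = extract_terms("Fix the bug in src/session_db.py and re-run the tests")
print(terms)
# Expected (given the stop-list assumption above):
# ['fix', 'bug', 'src/session_db.py', 're-run', 'tests']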