mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-26 06:01:49 +00:00
feat: add term_index inverted index for instant session search
Adds a term-based inverted index (term_index table, schema v7) that eliminates LLM summarization from the default search path. The fast path returns session metadata and match counts in ~1ms vs 10-15s for the full FTS5+LLM pipeline. Key changes: - term_index table: (term, message_id, session_id) WITHOUT ROWID for clustered B-tree lookups. Populated at write time in append_message (best-effort, never blocks inserts). - stop_words.py: 179-word NLTK English stop list, no stemming - term_index.py: extract_terms() for term extraction - session_search_tool.py: fast=True default, _fast_search for term index path, _full_search preserves original behavior, CJK query fallback to slow path - Auto-reindex on v7 migration: _init_schema returns needs_reindex flag, __init__ calls reindex_term_index() after migration - Swap strategy for reindex: builds into temp table, then atomic swap in single transaction (no empty-index window) - get_child_session_ids(): public API replacing db._lock/db._conn access in _fast_search - mode field in search results: 'fast' or 'full' - Cascade deletes: clear_messages, delete_session, prune_sessions all clean term_index entries Benchmarks on production DB (47.7 MB, 29,435 messages): - Term index reindex: 1,152,587 entries from 29,435 messages in 4s - Fast path: 1-4ms (no LLM) - Slow path: 10,000-16,000ms (FTS5 + LLM summarization) - Speedup: 4,000-15,000x on full round-trip 195 tests passing (48 term_index + 149 hermes_state). 12 regression tests from red-team QA covering: param binding, child session resolution, cascade deletes, CJK fallback.
This commit is contained in:
parent
de1a3922ed
commit
410456c599
6 changed files with 1097 additions and 15 deletions
44
term_index.py
Normal file
44
term_index.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
"""Term index — inverted index extraction for session search fast path.
|
||||
|
||||
Extracts non-stop-word terms from message content for insertion into the
|
||||
term_index table in SessionDB. Terms are lowercased, punctuation-stripped
|
||||
(with preservation of path-like strings), and deduplicated per message.
|
||||
"""
|
||||
|
||||
import re
|
||||
from stop_words import is_stop_word
|
||||
|
||||
# Matches "words" including paths (foo/bar), filenames (file.py), and
|
||||
# hyphenated terms (self-hosted). Filters out most punctuation but
|
||||
# preserves dots in filenames and slashes in paths.
|
||||
# Strategy: match tokens directly with a single regex; every match must start and end with a letter or digit, so surrounding punctuation is never captured.
|
||||
# Two alternatives: (1) a token that begins and ends with an ASCII letter or
# digit, with any mix of word characters (\w), dots, slashes, or hyphens in
# between; (2) a lone ASCII letter or digit. Because both ends must be ASCII,
# leading or trailing non-ASCII characters are left out of the match, and text
# containing no ASCII letters or digits yields no tokens at all.
_TERM_RE = re.compile(r"[a-zA-Z0-9][\w./\-]*[a-zA-Z0-9]|[a-zA-Z0-9]")
|
||||
|
||||
|
||||
def extract_terms(content: str) -> list[str]:
    """Return the distinct, lowercased, non-stop-word terms found in *content*.

    Candidate tokens are whatever ``_TERM_RE`` matches in *content*. Each
    candidate is lowercased and discarded when ``is_stop_word`` accepts it.
    The remaining terms are returned in order of first appearance, each one
    at most once. Falsy content (empty string, ``None``) yields an empty list.

    Single-character tokens get no special treatment here: only the
    stop-word check can remove them.
    """
    if not content:
        return []

    lowered = (match.lower() for match in _TERM_RE.findall(content))
    # dict.fromkeys keeps the first occurrence of each key and preserves
    # insertion order, giving ordered per-message de-duplication in one pass.
    return list(dict.fromkeys(term for term in lowered if not is_stop_word(term)))
|
||||
Loading…
Add table
Add a link
Reference in a new issue