Mirror of https://github.com/NousResearch/hermes-agent.git, synced 2026-04-28 01:21:43 +00:00
Adds a term-based inverted index (term_index table, schema v7) that eliminates LLM summarization from the default search path. The fast path returns session metadata and match counts in ~1 ms, versus 10-15 s for the full FTS5 + LLM pipeline.

Key changes:
- term_index table: (term, message_id, session_id) WITHOUT ROWID for clustered B-tree lookups. Populated at write time in append_message (best-effort, never blocks inserts). A rough sketch of the schema and fast-path lookup appears below.
- stop_words.py: 179-word NLTK English stop list, no stemming.
- term_index.py: extract_terms() for term extraction.
- session_search_tool.py: fast=True default, _fast_search for the term-index path, _full_search preserves the original behavior, CJK queries fall back to the slow path.
- Auto-reindex on v7 migration: _init_schema returns a needs_reindex flag; __init__ calls reindex_term_index() after migration.
- Swap strategy for reindex: builds into a temp table, then performs an atomic swap in a single transaction (no empty-index window). See the reindex sketch below.
- get_child_session_ids(): public API replacing db._lock/db._conn access in _fast_search.
- mode field in search results: 'fast' or 'full'.
- Cascade deletes: clear_messages, delete_session, and prune_sessions all clean up term_index entries.

Benchmarks on production DB (47.7 MB, 29,435 messages):
- Term index reindex: 1,152,587 entries from 29,435 messages in 4 s.
- Fast path: 1-4 ms (no LLM).
- Slow path: 10,000-16,000 ms (FTS5 + LLM summarization).
- Speedup: 4,000-15,000x on the full round trip.

195 tests passing (48 term_index + 149 hermes_state). 12 regression tests from red-team QA cover param binding, child session resolution, cascade deletes, and CJK fallback.
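The table layout and fast-path lookup can be sketched roughly as follows, assuming a SQLite-backed store. Only the names term_index, term, message_id, and session_id come from the commit description; the exact DDL, primary key choice, and query shape (including fast_search itself) are illustrative assumptions, not the repository's actual code.

```python
# Minimal sketch of the term_index fast path (assumed SQLite schema and query).
import sqlite3

SCHEMA = """
CREATE TABLE IF NOT EXISTS term_index (
    term       TEXT    NOT NULL,
    message_id INTEGER NOT NULL,
    session_id TEXT    NOT NULL,
    PRIMARY KEY (term, message_id)
) WITHOUT ROWID;
"""
# WITHOUT ROWID stores the table clustered on its primary key, so all rows for
# one term sit contiguously in the B-tree and a per-term range scan is cheap.

def fast_search(conn: sqlite3.Connection, terms: list[str], limit: int = 20):
    """Return (session_id, match_count) rows for the query terms, best first."""
    if not terms:
        return []
    placeholders = ",".join("?" for _ in terms)
    return conn.execute(
        f"""
        SELECT session_id, COUNT(*) AS matches
        FROM term_index
        WHERE term IN ({placeholders})
        GROUP BY session_id
        ORDER BY matches DESC
        LIMIT ?
        """,
        (*terms, limit),
    ).fetchall()

if __name__ == "__main__":
    conn = sqlite3.connect(":memory:")
    conn.executescript(SCHEMA)
    conn.executemany(
        "INSERT INTO term_index VALUES (?, ?, ?)",
        [("index", 1, "s1"), ("term", 1, "s1"), ("index", 2, "s2")],
    )
    print(fast_search(conn, ["term", "index"]))  # [('s1', 2), ('s2', 1)]
```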
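The no-empty-window reindex can likewise be sketched: build the new index under a scratch name, then replace the live table inside a single transaction. Only the swap idea and the reindex_term_index name come from the commit description; the table shape, the iter_messages callable, and the transaction handling below are assumptions for illustration.

```python
# Sketch of the swap-based reindex (assumed SQLite connection in its default
# transaction mode). Readers never observe an empty term_index.
import sqlite3

def reindex_term_index(conn: sqlite3.Connection, iter_messages) -> int:
    """Rebuild the index; iter_messages yields (message_id, session_id, terms)."""
    conn.execute("DROP TABLE IF EXISTS term_index_new")
    conn.execute(
        """CREATE TABLE term_index_new (
               term       TEXT    NOT NULL,
               message_id INTEGER NOT NULL,
               session_id TEXT    NOT NULL,
               PRIMARY KEY (term, message_id)
           ) WITHOUT ROWID"""
    )
    count = 0
    for message_id, session_id, terms in iter_messages:
        # OR IGNORE absorbs repeated terms within one message (same PK).
        conn.executemany(
            "INSERT OR IGNORE INTO term_index_new VALUES (?, ?, ?)",
            [(term, message_id, session_id) for term in terms],
        )
        count += len(terms)
    conn.commit()  # finish building the scratch table
    # Atomic swap: drop and rename commit together, so there is no window in
    # which term_index exists but is empty.
    conn.execute("BEGIN IMMEDIATE")
    conn.execute("DROP TABLE IF EXISTS term_index")
    conn.execute("ALTER TABLE term_index_new RENAME TO term_index")
    conn.commit()
    return count
```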
42 lines · No EOL · 2.1 KiB · Python
"""Stop word list for term index extraction.
|
|
|
|
Uses the well-known NLTK English stop word list (179 words) as a baseline.
|
|
This module is self-contained -- no external dependencies.
|
|
"""
|
|
|
|
# Standard English stop words (NLTK list, public domain)
|
|
# Covers articles, conjunctions, prepositions, pronouns, auxiliary verbs,
|
|
# and common function words. Intentionally excludes short tech terms
|
|
# that overlap (e.g., "go", "it" as in IT/InfoTech handled by context).
|
|
_STOP_WORDS = frozenset(
|
|
w.lower() for w in [
|
|
"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
|
|
"your", "yours", "yourself", "yourselves", "he", "him", "his",
|
|
"himself", "she", "her", "hers", "herself", "it", "its", "itself",
|
|
"they", "them", "their", "theirs", "themselves", "what", "which",
|
|
"who", "whom", "this", "that", "these", "those", "am", "is", "are",
|
|
"was", "were", "be", "been", "being", "have", "has", "had", "having",
|
|
"do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
|
|
"or", "because", "as", "until", "while", "of", "at", "by", "for",
|
|
"with", "about", "against", "between", "through", "during", "before",
|
|
"after", "above", "below", "to", "from", "up", "down", "in", "out",
|
|
"on", "off", "over", "under", "again", "further", "then", "once",
|
|
"here", "there", "when", "where", "why", "how", "all", "both", "each",
|
|
"few", "more", "most", "other", "some", "such", "no", "nor", "not",
|
|
"only", "own", "same", "so", "than", "too", "very", "s", "t", "can",
|
|
"will", "just", "don", "should", "now", "d", "ll", "m", "o", "re",
|
|
"ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn",
|
|
"haven", "isn", "ma", "mightn", "mustn", "needn", "shan", "shouldn",
|
|
"wasn", "weren", "won", "wouldn",
|
|
]
|
|
)
|
|
|
|
|
|
def is_stop_word(word: str) -> bool:
|
|
"""Check if a word is a stop word. Case-insensitive."""
|
|
return word.lower() in _STOP_WORDS
|
|
|
|
|
|
def get_stop_words() -> frozenset:
|
|
"""Return the full stop word set (for inspection/bulk use)."""
|
|
return _STOP_WORDS |
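
A quick usage sketch of the two helpers above, assuming the module is importable as stop_words. The filtering loop imagines how a caller such as extract_terms() in term_index.py might use is_stop_word; that calling pattern is an assumption, only is_stop_word and get_stop_words themselves are defined in this file.

```python
# Illustrative usage of stop_words; the tokenization below is a hypothetical
# caller, not code from term_index.py.
from stop_words import is_stop_word, get_stop_words

assert is_stop_word("The")            # lookup is case-insensitive
assert not is_stop_word("index")      # content words pass through
assert len(get_stop_words()) == 179   # matches the NLTK count in the docstring

# A caller like extract_terms() would typically drop stop words after tokenizing:
tokens = "the term index eliminates llm summarization from the search path".split()
terms = [t for t in tokens if not is_stop_word(t)]
print(terms)  # ['term', 'index', 'eliminates', 'llm', 'summarization', 'search', 'path']
```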