diff --git a/hermes_state.py b/hermes_state.py
index eebbb9d2d6..61a837febc 100644
--- a/hermes_state.py
+++ b/hermes_state.py
@@ -1076,7 +1076,9 @@ class SessionDB:
         # Insert terms into inverted index (delayed import avoids
         # circular dependency — hermes_state is imported by nearly
         # everything at startup, term_index must not be top-level)
-        if content:
+        # Skip tool-role messages: their structured JSON output produces
+        # noise terms (field names, numeric values) with no search value.
+        if content and role != "tool":
             try:
                 from term_index import extract_terms
                 terms = extract_terms(content)
@@ -1605,7 +1607,7 @@ class SessionDB:
             # Read batch outside write lock
             with self._lock:
                 cursor = self._conn.execute(
-                    "SELECT id, session_id, content FROM messages ORDER BY id LIMIT ? OFFSET ?",
+                    "SELECT id, session_id, role, content FROM messages ORDER BY id LIMIT ? OFFSET ?",
                     (batch_size, offset),
                 )
                 rows = cursor.fetchall()
@@ -1613,11 +1615,14 @@ class SessionDB:
             if not rows:
                 break
 
-            # Extract terms for the batch
+            # Extract terms for the batch, skipping tool-role messages
             entries = []
             for row in rows:
                 msg_id = row["id"]
                 session_id = row["session_id"]
+                # Skip tool messages — structured JSON output produces noise terms
+                if row["role"] == "tool":
+                    continue
                 content = row["content"] or ""
                 terms = extract_terms(content)
                 for term in terms:
diff --git a/stop_words.py b/stop_words.py
index 2bf44578ad..d1346b4c25 100644
--- a/stop_words.py
+++ b/stop_words.py
@@ -1,14 +1,18 @@
 """Stop word list for term index extraction.
 
-Uses the well-known NLTK English stop word list (179 words) as a baseline.
+Uses the well-known NLTK English stop word list (179 words) as a baseline,
+plus common JSON schema keys from tool output and a pure-numeric filter.
+
 This module is self-contained -- no external dependencies.
 """
 
+import re
+
 # Standard English stop words (NLTK list, public domain)
 # Covers articles, conjunctions, prepositions, pronouns, auxiliary verbs,
 # and common function words. Intentionally excludes short tech terms
 # that overlap (e.g., "go", "it" as in IT/InfoTech handled by context).
-_STOP_WORDS = frozenset(
+_ENGLISH_STOP_WORDS = frozenset(
     w.lower() for w in [
         "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
         "you", "your", "yours", "yourself", "yourselves", "he", "him", "his",
@@ -31,12 +35,45 @@
     ]
 )
 
+# JSON schema keys that appear constantly in tool output.
+# These are field names from structured tool responses, not semantic content.
+# Nobody searches for "exit_code" to find a past session.
+_JSON_KEY_STOP_WORDS = frozenset([
+    "output",
+    "exit_code",
+    "error",
+    "null",
+    "true",
+    "false",
+    "status",
+    "content",
+    "message",
+    "cleared",
+    "success",
+])
+
+# Combined stop word set
+_STOP_WORDS = _ENGLISH_STOP_WORDS | _JSON_KEY_STOP_WORDS
+
+# Pattern to detect pure numeric tokens (decimal integers)
+_NUMERIC_RE = re.compile(r"^[0-9]+$")
+
 
 def is_stop_word(word: str) -> bool:
     """Check if a word is a stop word. Case-insensitive."""
     return word.lower() in _STOP_WORDS
 
 
+def is_noise_term(word: str) -> bool:
+    """Check if a term is noise that should be excluded from the index.
+
+    This covers stop words AND pure numeric tokens, which provide zero
+    search value. Nobody searches for '0', '1', or '42' to find a session.
+    """
+    lower = word.lower()
+    return lower in _STOP_WORDS or _NUMERIC_RE.match(lower) is not None
+
+
 def get_stop_words() -> frozenset:
     """Return the full stop word set (for inspection/bulk use)."""
     return _STOP_WORDS
\ No newline at end of file
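A quick doctest-style sanity check of the combined filter (illustrative sketch, not part of the patch; the expected values follow from the definitions above):

    >>> from stop_words import is_stop_word, is_noise_term
    >>> is_stop_word("exit_code")   # JSON schema key, now treated as a stop word
    True
    >>> is_noise_term("42")         # pure numeric token
    True
    >>> is_noise_term("docker")     # real content term survives
    False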
+ """ + lower = word.lower() + return lower in _STOP_WORDS or _NUMERIC_RE.match(lower) is not None + + def get_stop_words() -> frozenset: """Return the full stop word set (for inspection/bulk use).""" return _STOP_WORDS \ No newline at end of file diff --git a/term_index.py b/term_index.py index 03ba1524ac..ebaa11781e 100644 --- a/term_index.py +++ b/term_index.py @@ -3,10 +3,15 @@ Extracts non-stop-word terms from message content for insertion into the term_index table in SessionDB. Terms are lowercased, punctuation-stripped (with preservation of path-like strings), and deduplicated per message. + +Noise filtering: + - English stop words (NLTK list) + - JSON schema keys from tool output (output, exit_code, error, etc.) + - Pure numeric tokens (0, 1, 42, etc.) """ import re -from stop_words import is_stop_word +from stop_words import is_noise_term # Matches "words" including paths (foo/bar), filenames (file.py), and # hyphenated terms (self-hosted). Filters out most punctuation but @@ -16,10 +21,10 @@ _TERM_RE = re.compile(r"[a-zA-Z0-9][\w./\-]*[a-zA-Z0-9]|[a-zA-Z0-9]") def extract_terms(content: str) -> list[str]: - """Extract non-stop-word terms from message content. + """Extract non-noise terms from message content. Returns a deduplicated, lowercased list of terms. - Stops words, pure punctuation, and empty strings are excluded. + Stop words, JSON keys, pure numerics, and empty strings are excluded. """ if not content: return [] @@ -31,11 +36,9 @@ def extract_terms(content: str) -> list[str]: terms = [] for token in raw_tokens: lower = token.lower() - # Skip stop words - if is_stop_word(lower): + # Skip noise: stop words, JSON keys, pure numerics + if is_noise_term(lower): continue - # Skip single characters except meaningful ones - # (but these are already handled by stop words for 'a', 'I', etc.) # Deduplicate within this message if lower not in seen: seen.add(lower) diff --git a/tests/test_term_index.py b/tests/test_term_index.py index 46f76d4463..c2e8944ffa 100644 --- a/tests/test_term_index.py +++ b/tests/test_term_index.py @@ -562,6 +562,118 @@ class TestGetChildSessionIds: assert children == ["child"] +class TestNoiseReduction: + """Tests for noise reduction in term indexing. + + Tool-role messages (structured JSON output) produce junk terms like + 'output', 'exit_code', 'null', 'true', 'false'. Pure numeric tokens + ('0', '1', '2') are never useful search targets. JSON key names that + appear in tool output schemas should be treated as stop words. 
+ """ + + def test_tool_role_messages_not_indexed(self, db): + """Tool-role messages should be skipped entirely during indexing.""" + db.create_session(session_id="s1", source="cli") + db.append_message( + session_id="s1", + role="tool", + content='{"output": "docker is running", "exit_code": 0}', + tool_name="terminal", + ) + + # Tool output should NOT index any terms from the JSON blob + # Even though 'docker' appears in the output string, it's inside + # structured JSON from a tool call, not natural language + cursor = db._conn.execute( + "SELECT COUNT(*) FROM term_index WHERE session_id = 's1'" + ) + assert cursor.fetchone()[0] == 0 + + def test_assistant_role_still_indexed(self, db): + """Non-tool messages should still be indexed normally.""" + db.create_session(session_id="s1", source="cli") + db.append_message(session_id="s1", role="user", content="docker deploy") + db.append_message( + session_id="s1", role="assistant", content="docker is now running" + ) + + results = db.search_by_terms(["docker"]) + assert len(results) >= 1 + + def test_pure_numeric_tokens_filtered(self): + """Pure numeric tokens should be excluded from term extraction.""" + from term_index import extract_terms + + terms = extract_terms("exit code 0 with 42 errors in 123 steps") + # These numeric tokens provide zero search value + for num in ["0", "42", "123"]: + assert num not in terms, f"Pure numeric '{num}' should be filtered" + + # But word tokens should survive + assert "exit" in terms + assert "code" in terms + assert "errors" in terms + assert "steps" in terms + + def test_json_key_stopwords_filtered(self): + """Common JSON schema keys from tool output should be stop words.""" + from stop_words import is_stop_word + + json_keys = [ + "output", + "exit_code", + "error", + "null", + "true", + "false", + "status", + "content", + "message", + "cleared", + "success", + ] + for key in json_keys: + assert is_stop_word(key), f"JSON key '{key}' should be a stop word" + + def test_json_key_stopwords_in_extract_terms(self): + """JSON key stop words should be filtered by extract_terms.""" + from term_index import extract_terms + + # Simulates typical tool output content + terms = extract_terms( + '{"output": "hello world", "exit_code": 0, "error": null, "success": true}' + ) + for junk in ["output", "exit_code", "error", "null", "success", "true", "false"]: + assert junk not in terms, f"JSON key '{junk}' should be filtered" + + # Actual content words should survive + assert "hello" in terms + assert "world" in terms + + def test_reindex_skips_tool_messages(self, db): + """reindex_term_index should not index tool-role messages.""" + db.create_session(session_id="s1", source="cli") + db.append_message(session_id="s1", role="user", content="deploy docker") + db.append_message( + session_id="s1", + role="tool", + content='{"output": "docker running", "exit_code": 0}', + ) + + # Clear and reindex + db._conn.execute("DELETE FROM term_index") + db._conn.commit() + db.reindex_term_index() + + # Tool message terms should not be in index + cursor = db._conn.execute( + "SELECT term FROM term_index WHERE session_id = 's1'" + ) + indexed_terms = [row[0] for row in cursor.fetchall()] + for junk in ["output", "exit_code", "0"]: + assert junk not in indexed_terms, f"'{junk}' should not be indexed from tool messages" + + class TestCJKFallbackInFastSearch: """CJK queries should fall through to the slow path even when fast=True.