feat: reduce term index noise — skip tool messages, filter numerics and JSON keys

Tool-role messages (44% of corpus) produce junk terms from structured JSON output: field names like 'output', 'exit_code', 'error', 'status' and numeric values like '0', '1', '42'. These have zero search value and account for ~77% of index rows.

Changes:
- stop_words.py: add JSON key stop words (output, exit_code, error, null, true, false, status, content, message, cleared, success) and is_noise_term(), which also filters pure numeric tokens
- term_index.py: switch from is_stop_word to is_noise_term
- hermes_state.py: skip tool-role messages at insert and reindex paths
- tests: 6 new TestNoiseReduction tests

Impact on live DB: 1.19M -> 278K rows (77% reduction), 1.5s reindex.
2026-05-18 04:41:56 +00:00 · 2026-04-22 14:39:29 -04:00 · 2026-04-22 14:39:29 -04:00 · 94f1758742
commit 94f1758742
parent 410456c599
4 changed files with 169 additions and 12 deletions
--- a/term_index.py
+++ b/term_index.py
@@ -3,10 +3,15 @@
 Extracts non-stop-word terms from message content for insertion into the
 term_index table in SessionDB. Terms are lowercased, punctuation-stripped
 (with preservation of path-like strings), and deduplicated per message.
+
+Noise filtering:
+  - English stop words (NLTK list)
+  - JSON schema keys from tool output (output, exit_code, error, etc.)
+  - Pure numeric tokens (0, 1, 42, etc.)
 """

 import re
-from stop_words import is_stop_word
+from stop_words import is_noise_term

 # Matches "words" including paths (foo/bar), filenames (file.py), and
 # hyphenated terms (self-hosted). Filters out most punctuation but
@@ -16,10 +21,10 @@ _TERM_RE = re.compile(r"[a-zA-Z0-9][\w./\-]*[a-zA-Z0-9]|[a-zA-Z0-9]")


 def extract_terms(content: str) -> list[str]:
-    """Extract non-stop-word terms from message content.
+    """Extract non-noise terms from message content.

    Returns a deduplicated, lowercased list of terms.
-    Stops words, pure punctuation, and empty strings are excluded.
+    Stop words, JSON keys, pure numerics, and empty strings are excluded.
    """
    if not content:
        return []
@@ -31,11 +36,9 @@ def extract_terms(content: str) -> list[str]:
    terms = []
    for token in raw_tokens:
        lower = token.lower()
-        # Skip stop words
-        if is_stop_word(lower):
+        # Skip noise: stop words, JSON keys, pure numerics
+        if is_noise_term(lower):
            continue
-        # Skip single characters except meaningful ones
-        # (but these are already handled by stop words for 'a', 'I', etc.)
        # Deduplicate within this message
        if lower not in seen:
            seen.add(lower)