feat: reduce term index noise — skip tool messages, filter numerics and JSON keys

Tool-role messages (44% of corpus) produce junk terms from structured JSON output: field names like 'output', 'exit_code', 'error', 'status' and numeric values like '0', '1', '42'. These have zero search value and account for ~77% of index rows.

Changes:
- stop_words.py: add JSON key stop words (output, exit_code, error, null, true, false, status, content, message, cleared, success) and is_noise_term(), which also filters pure numeric tokens
- term_index.py: switch from is_stop_word to is_noise_term
- hermes_state.py: skip tool-role messages at insert and reindex paths
- tests: 6 new TestNoiseReduction tests

Impact on live DB: 1.19M -> 278K rows (77% reduction), 1.5s reindex.
2026-05-24 05:41:40 +00:00 · 2026-04-22 14:39:29 -04:00 · 2026-04-22 14:39:29 -04:00 · 94f1758742
commit 94f1758742
parent 410456c599
4 changed files with 169 additions and 12 deletions
--- a/stop_words.py
+++ b/stop_words.py
@ -1,14 +1,18 @@
 """Stop word list for term index extraction.

-Uses the well-known NLTK English stop word list (179 words) as a baseline.
+Uses the well-known NLTK English stop word list (179 words) as a baseline,
+plus common JSON schema keys from tool output and a pure-numeric token filter.
+
 This module is self-contained -- no external dependencies.
 """

+import re
+
 # Standard English stop words (NLTK list, public domain)
 # Covers articles, conjunctions, prepositions, pronouns, auxiliary verbs,
 # and common function words. Intentionally excludes short tech terms
 # that overlap (e.g., "go", "it" as in IT/InfoTech handled by context).
-_STOP_WORDS = frozenset(
+_ENGLISH_STOP_WORDS = frozenset(
    w.lower() for w in [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
@ -31,12 +35,45 @@ _STOP_WORDS = frozenset(
    ]
 )

+# JSON schema keys that appear constantly in tool output.
+# These are field names from structured tool responses, not semantic content.
+# Nobody searches for "exit_code" to find a past session.
+_JSON_KEY_STOP_WORDS = frozenset([
+    "output",
+    "exit_code",
+    "error",
+    "null",
+    "true",
+    "false",
+    "status",
+    "content",
+    "message",
+    "cleared",
+    "success",
+])
+
+# Combined stop word set
+_STOP_WORDS = _ENGLISH_STOP_WORDS | _JSON_KEY_STOP_WORDS
+
+# Pattern to detect pure numeric tokens (ASCII decimal digits only; floats, signed values and hex are not matched)
+_NUMERIC_RE = re.compile(r"^[0-9]+$")
+

 def is_stop_word(word: str) -> bool:
    """Check if a word is a stop word. Case-insensitive."""
    return word.lower() in _STOP_WORDS


+def is_noise_term(word: str) -> bool:
+    """Check if a term is noise that should be excluded from the index.
+
+    This covers stop words AND pure numeric tokens, which provide zero
+    search value. Nobody searches for '0', '1', or '42' to find a session.
+    """
+    lower = word.lower()
+    return lower in _STOP_WORDS or _NUMERIC_RE.match(lower) is not None
+
+
 def get_stop_words() -> frozenset:
    """Return the full stop word set (for inspection/bulk use)."""
    return _STOP_WORDS