feat: reduce term index noise — skip tool messages, filter numerics and JSON keys

Tool-role messages (44% of corpus) produce junk terms from structured
JSON output: field names like 'output', 'exit_code', 'error', 'status'
and numeric values like '0', '1', '42'. These have zero search value and
account for ~77% of index rows.

Changes:
- stop_words.py: add JSON key stop words (output, exit_code, error, null,
  true, false, status, content, message, cleared, success) and
  is_noise_term() that also filters pure numeric tokens
- term_index.py: switch from is_stop_word to is_noise_term
- hermes_state.py: skip tool-role messages at insert and reindex paths
- tests: 6 new TestNoiseReduction tests

Impact on live DB: 1.19M -> 278K rows (77% reduction), 1.5s reindex.
This commit is contained in:
AJ 2026-04-22 14:39:29 -04:00
parent 410456c599
commit 94f1758742
4 changed files with 169 additions and 12 deletions

View file

@ -1076,7 +1076,9 @@ class SessionDB:
# Insert terms into inverted index (delayed import avoids
# circular dependency — hermes_state is imported by nearly
# everything at startup, term_index must not be top-level)
if content:
# Skip tool-role messages: their structured JSON output produces
# noise terms (field names, numeric values) with no search value.
if content and role != "tool":
try:
from term_index import extract_terms
terms = extract_terms(content)
@ -1605,7 +1607,7 @@ class SessionDB:
# Read batch outside write lock
with self._lock:
cursor = self._conn.execute(
"SELECT id, session_id, content FROM messages ORDER BY id LIMIT ? OFFSET ?",
"SELECT id, session_id, role, content FROM messages ORDER BY id LIMIT ? OFFSET ?",
(batch_size, offset),
)
rows = cursor.fetchall()
@ -1613,11 +1615,14 @@ class SessionDB:
if not rows:
break
# Extract terms for the batch
# Extract terms for the batch, skipping tool-role messages
entries = []
for row in rows:
msg_id = row["id"]
session_id = row["session_id"]
# Skip tool messages — structured JSON output produces noise terms
if row["role"] == "tool":
continue
content = row["content"] or ""
terms = extract_terms(content)
for term in terms:

View file

@ -1,14 +1,18 @@
"""Stop word list for term index extraction.
Uses the well-known NLTK English stop word list (179 words) as a baseline.
Uses the well-known NLTK English stop word list (179 words) as a baseline,
plus common JSON schema keys from tool output and pure-numeric filter.
This module is self-contained -- no external dependencies.
"""
import re
# Standard English stop words (NLTK list, public domain)
# Covers articles, conjunctions, prepositions, pronouns, auxiliary verbs,
# and common function words. Intentionally excludes short tech terms
# that overlap (e.g., "go", "it" as in IT/InfoTech handled by context).
_STOP_WORDS = frozenset(
_ENGLISH_STOP_WORDS = frozenset(
w.lower() for w in [
"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
"your", "yours", "yourself", "yourselves", "he", "him", "his",
@ -31,12 +35,45 @@ _STOP_WORDS = frozenset(
]
)
# JSON schema keys that appear constantly in tool output.
# These are field names from structured tool responses, not semantic content.
# Nobody searches for "exit_code" to find a past session.
_JSON_KEY_STOP_WORDS = frozenset([
"output",
"exit_code",
"error",
"null",
"true",
"false",
"status",
"content",
"message",
"cleared",
"success",
])
# Combined stop word set
_STOP_WORDS = _ENGLISH_STOP_WORDS | _JSON_KEY_STOP_WORDS
# Pattern to detect digit-only integer tokens ("0", "42"); floats and hex are not matched
_NUMERIC_RE = re.compile(r"^[0-9]+$")
def is_stop_word(word: str) -> bool:
    """Check whether *word* is in the combined stop word set (case-insensitive)."""
    normalized = word.lower()
    return normalized in _STOP_WORDS
def is_noise_term(word: str) -> bool:
    """Check whether a term is noise that should be excluded from the index.

    Noise covers the combined stop word set (English stop words plus JSON
    schema keys) as well as digit-only tokens, none of which carry search
    value. Nobody searches for '0', '1', or '42' to find a session.
    Comparison is case-insensitive.
    """
    normalized = word.lower()
    if normalized in _STOP_WORDS:
        return True
    # _NUMERIC_RE is anchored, so match() tests the whole token.
    return _NUMERIC_RE.match(normalized) is not None
def get_stop_words() -> frozenset:
    """Expose the combined stop word set (for inspection/bulk use)."""
    return _STOP_WORDS

View file

@ -3,10 +3,15 @@
Extracts non-stop-word terms from message content for insertion into the
term_index table in SessionDB. Terms are lowercased, punctuation-stripped
(with preservation of path-like strings), and deduplicated per message.
Noise filtering:
- English stop words (NLTK list)
- JSON schema keys from tool output (output, exit_code, error, etc.)
- Pure numeric tokens (0, 1, 42, etc.)
"""
import re
from stop_words import is_stop_word
from stop_words import is_noise_term
# Matches "words" including paths (foo/bar), filenames (file.py), and
# hyphenated terms (self-hosted). Filters out most punctuation but
@ -16,10 +21,10 @@ _TERM_RE = re.compile(r"[a-zA-Z0-9][\w./\-]*[a-zA-Z0-9]|[a-zA-Z0-9]")
def extract_terms(content: str) -> list[str]:
"""Extract non-stop-word terms from message content.
"""Extract non-noise terms from message content.
Returns a deduplicated, lowercased list of terms.
Stops words, pure punctuation, and empty strings are excluded.
Stop words, JSON keys, pure numerics, and empty strings are excluded.
"""
if not content:
return []
@ -31,11 +36,9 @@ def extract_terms(content: str) -> list[str]:
terms = []
for token in raw_tokens:
lower = token.lower()
# Skip stop words
if is_stop_word(lower):
# Skip noise: stop words, JSON keys, pure numerics
if is_noise_term(lower):
continue
# Skip single characters except meaningful ones
# (but these are already handled by stop words for 'a', 'I', etc.)
# Deduplicate within this message
if lower not in seen:
seen.add(lower)

View file

@ -562,6 +562,118 @@ class TestGetChildSessionIds:
assert children == ["child"]
class TestNoiseReduction:
    """Noise-reduction behavior of the term index.

    Structured JSON from tool-role messages yields junk terms such as
    'output', 'exit_code', 'null', 'true', and 'false', and digit-only
    tokens ('0', '1', '2') are never useful search targets. JSON key
    names that appear in tool output schemas are treated as stop words,
    and tool-role messages are skipped outright.
    """

    def test_tool_role_messages_not_indexed(self, db):
        """Tool-role messages should be skipped entirely during indexing."""
        db.create_session(session_id="s1", source="cli")
        payload = '{"output": "docker is running", "exit_code": 0}'
        db.append_message(
            session_id="s1",
            role="tool",
            content=payload,
            tool_name="terminal",
        )
        # Nothing from the JSON blob may land in the index. Even though
        # 'docker' appears in the output string, it is structured tool
        # output from a tool call, not natural language.
        cursor = db._conn.execute(
            "SELECT COUNT(*) FROM term_index WHERE session_id = 's1'"
        )
        assert cursor.fetchone()[0] == 0

    def test_assistant_role_still_indexed(self, db):
        """Non-tool messages should still be indexed normally."""
        db.create_session(session_id="s1", source="cli")
        db.append_message(session_id="s1", role="user", content="docker deploy")
        db.append_message(
            session_id="s1", role="assistant", content="docker is now running"
        )
        hits = db.search_by_terms(["docker"])
        assert len(hits) >= 1

    def test_pure_numeric_tokens_filtered(self):
        """Pure numeric tokens should be excluded from term extraction."""
        from term_index import extract_terms

        terms = extract_terms("exit code 0 with 42 errors in 123 steps")
        # Digit-only tokens carry zero search value...
        for num in ("0", "42", "123"):
            assert num not in terms, f"Pure numeric '{num}' should be filtered"
        # ...while the surrounding word tokens must survive.
        for word in ("exit", "code", "errors", "steps"):
            assert word in terms

    def test_json_key_stopwords_filtered(self):
        """Common JSON schema keys from tool output should be stop words."""
        from stop_words import is_stop_word

        json_keys = (
            "output",
            "exit_code",
            "error",
            "null",
            "true",
            "false",
            "status",
            "content",
            "message",
            "cleared",
            "success",
        )
        for key in json_keys:
            assert is_stop_word(key), f"JSON key '{key}' should be a stop word"

    def test_json_key_stopwords_in_extract_terms(self):
        """JSON key stop words should be filtered by extract_terms."""
        from term_index import extract_terms

        # Simulates typical tool output content.
        blob = '{"output": "hello world", "exit_code": 0, "error": null, "success": true}'
        terms = extract_terms(blob)
        for junk in ("output", "exit_code", "error", "null", "success", "true", "false"):
            assert junk not in terms, f"JSON key '{junk}' should be filtered"
        # Genuine content words survive the filter.
        assert "hello" in terms
        assert "world" in terms

    def test_reindex_skips_tool_messages(self, db):
        """reindex_term_index should not index tool-role messages."""
        db.create_session(session_id="s1", source="cli")
        db.append_message(session_id="s1", role="user", content="deploy docker")
        db.append_message(
            session_id="s1",
            role="tool",
            content='{"output": "docker running", "exit_code": 0}',
        )
        # Wipe the index, then rebuild it from scratch.
        db._conn.execute("DELETE FROM term_index")
        db._conn.commit()
        db.reindex_term_index()
        # Terms from the tool message must stay out of the rebuilt index.
        cursor = db._conn.execute(
            "SELECT term FROM term_index WHERE session_id = 's1'"
        )
        indexed_terms = [row[0] for row in cursor.fetchall()]
        for junk in ("output", "exit_code", "0"):
            assert junk not in indexed_terms, f"'{junk}' should not be indexed from tool messages"
class TestCJKFallbackInFastSearch:
"""CJK queries should fall through to the slow path even when fast=True.