mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
feat: reduce term index noise — skip tool messages, filter numerics and JSON keys
Tool-role messages (44% of corpus) produce junk terms from structured JSON output: field names like 'output', 'exit_code', 'error', 'status' and numeric values like '0', '1', '42'. These have zero search value and account for ~77% of index rows. Changes: - stop_words.py: add JSON key stop words (output, exit_code, error, null, true, false, status, content, message, cleared, success) and is_noise_term() that also filters pure numeric tokens - term_index.py: switch from is_stop_word to is_noise_term - hermes_state.py: skip tool-role messages at insert and reindex paths - tests: 6 new TestNoiseReduction tests Impact on live DB: 1.19M -> 278K rows (77% reduction), 1.5s reindex.
This commit is contained in:
parent
410456c599
commit
94f1758742
4 changed files with 169 additions and 12 deletions
|
|
@ -1076,7 +1076,9 @@ class SessionDB:
|
|||
# Insert terms into inverted index (delayed import avoids
|
||||
# circular dependency — hermes_state is imported by nearly
|
||||
# everything at startup, term_index must not be top-level)
|
||||
if content:
|
||||
# Skip tool-role messages: their structured JSON output produces
|
||||
# noise terms (field names, numeric values) with no search value.
|
||||
if content and role != "tool":
|
||||
try:
|
||||
from term_index import extract_terms
|
||||
terms = extract_terms(content)
|
||||
|
|
@ -1605,7 +1607,7 @@ class SessionDB:
|
|||
# Read batch outside write lock
|
||||
with self._lock:
|
||||
cursor = self._conn.execute(
|
||||
"SELECT id, session_id, content FROM messages ORDER BY id LIMIT ? OFFSET ?",
|
||||
"SELECT id, session_id, role, content FROM messages ORDER BY id LIMIT ? OFFSET ?",
|
||||
(batch_size, offset),
|
||||
)
|
||||
rows = cursor.fetchall()
|
||||
|
|
@ -1613,11 +1615,14 @@ class SessionDB:
|
|||
if not rows:
|
||||
break
|
||||
|
||||
# Extract terms for the batch
|
||||
# Extract terms for the batch, skipping tool-role messages
|
||||
entries = []
|
||||
for row in rows:
|
||||
msg_id = row["id"]
|
||||
session_id = row["session_id"]
|
||||
# Skip tool messages — structured JSON output produces noise terms
|
||||
if row["role"] == "tool":
|
||||
continue
|
||||
content = row["content"] or ""
|
||||
terms = extract_terms(content)
|
||||
for term in terms:
|
||||
|
|
|
|||
|
|
@ -1,14 +1,18 @@
|
|||
"""Stop word list for term index extraction.
|
||||
|
||||
Uses the well-known NLTK English stop word list (179 words) as a baseline.
|
||||
Uses the well-known NLTK English stop word list (179 words) as a baseline,
|
||||
plus common JSON schema keys from tool output and pure-numeric filter.
|
||||
|
||||
This module is self-contained -- no external dependencies.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Standard English stop words (NLTK list, public domain)
|
||||
# Covers articles, conjunctions, prepositions, pronouns, auxiliary verbs,
|
||||
# and common function words. Intentionally excludes short tech terms
|
||||
# that overlap (e.g., "go", "it" as in IT/InfoTech handled by context).
|
||||
_STOP_WORDS = frozenset(
|
||||
_ENGLISH_STOP_WORDS = frozenset(
|
||||
w.lower() for w in [
|
||||
"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
|
||||
"your", "yours", "yourself", "yourselves", "he", "him", "his",
|
||||
|
|
@ -31,12 +35,45 @@ _STOP_WORDS = frozenset(
|
|||
]
|
||||
)
|
||||
|
||||
# JSON schema keys that appear constantly in tool output.
|
||||
# These are field names from structured tool responses, not semantic content.
|
||||
# Nobody searches for "exit_code" to find a past session.
|
||||
_JSON_KEY_STOP_WORDS = frozenset([
|
||||
"output",
|
||||
"exit_code",
|
||||
"error",
|
||||
"null",
|
||||
"true",
|
||||
"false",
|
||||
"status",
|
||||
"content",
|
||||
"message",
|
||||
"cleared",
|
||||
"success",
|
||||
])
|
||||
|
||||
# Combined stop word set
|
||||
_STOP_WORDS = _ENGLISH_STOP_WORDS | _JSON_KEY_STOP_WORDS
|
||||
|
||||
# Pattern to detect pure numeric tokens (decimal digits only; floats and hex are not matched)
|
||||
_NUMERIC_RE = re.compile(r"^[0-9]+$")
|
||||
|
||||
|
||||
def is_stop_word(word: str) -> bool:
|
||||
"""Check if a word is a stop word. Case-insensitive."""
|
||||
return word.lower() in _STOP_WORDS
|
||||
|
||||
|
||||
def is_noise_term(word: str) -> bool:
|
||||
"""Check if a term is noise that should be excluded from the index.
|
||||
|
||||
This covers stop words AND pure numeric tokens, which provide zero
|
||||
search value. Nobody searches for '0', '1', or '42' to find a session.
|
||||
"""
|
||||
lower = word.lower()
|
||||
return lower in _STOP_WORDS or _NUMERIC_RE.match(lower) is not None
|
||||
|
||||
|
||||
def get_stop_words() -> frozenset:
|
||||
"""Return the full stop word set (for inspection/bulk use)."""
|
||||
return _STOP_WORDS
|
||||
|
|
@ -3,10 +3,15 @@
|
|||
Extracts non-stop-word terms from message content for insertion into the
|
||||
term_index table in SessionDB. Terms are lowercased, punctuation-stripped
|
||||
(with preservation of path-like strings), and deduplicated per message.
|
||||
|
||||
Noise filtering:
|
||||
- English stop words (NLTK list)
|
||||
- JSON schema keys from tool output (output, exit_code, error, etc.)
|
||||
- Pure numeric tokens (0, 1, 42, etc.)
|
||||
"""
|
||||
|
||||
import re
|
||||
from stop_words import is_stop_word
|
||||
from stop_words import is_noise_term
|
||||
|
||||
# Matches "words" including paths (foo/bar), filenames (file.py), and
|
||||
# hyphenated terms (self-hosted). Filters out most punctuation but
|
||||
|
|
@ -16,10 +21,10 @@ _TERM_RE = re.compile(r"[a-zA-Z0-9][\w./\-]*[a-zA-Z0-9]|[a-zA-Z0-9]")
|
|||
|
||||
|
||||
def extract_terms(content: str) -> list[str]:
|
||||
"""Extract non-stop-word terms from message content.
|
||||
"""Extract non-noise terms from message content.
|
||||
|
||||
Returns a deduplicated, lowercased list of terms.
|
||||
Stops words, pure punctuation, and empty strings are excluded.
|
||||
Stop words, JSON keys, pure numerics, and empty strings are excluded.
|
||||
"""
|
||||
if not content:
|
||||
return []
|
||||
|
|
@ -31,11 +36,9 @@ def extract_terms(content: str) -> list[str]:
|
|||
terms = []
|
||||
for token in raw_tokens:
|
||||
lower = token.lower()
|
||||
# Skip stop words
|
||||
if is_stop_word(lower):
|
||||
# Skip noise: stop words, JSON keys, pure numerics
|
||||
if is_noise_term(lower):
|
||||
continue
|
||||
# Skip single characters except meaningful ones
|
||||
# (but these are already handled by stop words for 'a', 'I', etc.)
|
||||
# Deduplicate within this message
|
||||
if lower not in seen:
|
||||
seen.add(lower)
|
||||
|
|
|
|||
|
|
@ -562,6 +562,118 @@ class TestGetChildSessionIds:
|
|||
assert children == ["child"]
|
||||
|
||||
|
||||
class TestNoiseReduction:
|
||||
"""Tests for noise reduction in term indexing.
|
||||
|
||||
Tool-role messages (structured JSON output) produce junk terms like
|
||||
'output', 'exit_code', 'null', 'true', 'false'. Pure numeric tokens
|
||||
('0', '1', '2') are never useful search targets. JSON key names that
|
||||
appear in tool output schemas should be treated as stop words.
|
||||
"""
|
||||
|
||||
def test_tool_role_messages_not_indexed(self, db):
|
||||
"""Tool-role messages should be skipped entirely during indexing."""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.append_message(
|
||||
session_id="s1",
|
||||
role="tool",
|
||||
content='{"output": "docker is running", "exit_code": 0}',
|
||||
tool_name="terminal",
|
||||
)
|
||||
|
||||
# Tool output should NOT index any terms from the JSON blob
|
||||
# Even though 'docker' appears in the output string, it's inside
|
||||
# structured JSON from a tool call, not natural language
|
||||
cursor = db._conn.execute(
|
||||
"SELECT COUNT(*) FROM term_index WHERE session_id = 's1'"
|
||||
)
|
||||
assert cursor.fetchone()[0] == 0
|
||||
|
||||
def test_assistant_role_still_indexed(self, db):
|
||||
"""Non-tool messages should still be indexed normally."""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.append_message(session_id="s1", role="user", content="docker deploy")
|
||||
db.append_message(
|
||||
session_id="s1", role="assistant", content="docker is now running"
|
||||
)
|
||||
|
||||
results = db.search_by_terms(["docker"])
|
||||
assert len(results) >= 1
|
||||
|
||||
def test_pure_numeric_tokens_filtered(self):
|
||||
"""Pure numeric tokens should be excluded from term extraction."""
|
||||
from term_index import extract_terms
|
||||
|
||||
terms = extract_terms("exit code 0 with 42 errors in 123 steps")
|
||||
# These numeric tokens provide zero search value
|
||||
for num in ["0", "42", "123"]:
|
||||
assert num not in terms, f"Pure numeric '{num}' should be filtered"
|
||||
|
||||
# But word tokens should survive
|
||||
assert "exit" in terms
|
||||
assert "code" in terms
|
||||
assert "errors" in terms
|
||||
assert "steps" in terms
|
||||
|
||||
def test_json_key_stopwords_filtered(self):
|
||||
"""Common JSON schema keys from tool output should be stop words."""
|
||||
from stop_words import is_stop_word
|
||||
|
||||
json_keys = [
|
||||
"output",
|
||||
"exit_code",
|
||||
"error",
|
||||
"null",
|
||||
"true",
|
||||
"false",
|
||||
"status",
|
||||
"content",
|
||||
"message",
|
||||
"cleared",
|
||||
"success",
|
||||
]
|
||||
for key in json_keys:
|
||||
assert is_stop_word(key), f"JSON key '{key}' should be a stop word"
|
||||
|
||||
def test_json_key_stopwords_in_extract_terms(self):
|
||||
"""JSON key stop words should be filtered by extract_terms."""
|
||||
from term_index import extract_terms
|
||||
|
||||
# Simulates typical tool output content
|
||||
terms = extract_terms(
|
||||
'{"output": "hello world", "exit_code": 0, "error": null, "success": true}'
|
||||
)
|
||||
for junk in ["output", "exit_code", "error", "null", "success", "true", "false"]:
|
||||
assert junk not in terms, f"JSON key '{junk}' should be filtered"
|
||||
|
||||
# Actual content words should survive
|
||||
assert "hello" in terms
|
||||
assert "world" in terms
|
||||
|
||||
def test_reindex_skips_tool_messages(self, db):
|
||||
"""reindex_term_index should not index tool-role messages."""
|
||||
db.create_session(session_id="s1", source="cli")
|
||||
db.append_message(session_id="s1", role="user", content="deploy docker")
|
||||
db.append_message(
|
||||
session_id="s1",
|
||||
role="tool",
|
||||
content='{"output": "docker running", "exit_code": 0}',
|
||||
)
|
||||
|
||||
# Clear and reindex
|
||||
db._conn.execute("DELETE FROM term_index")
|
||||
db._conn.commit()
|
||||
db.reindex_term_index()
|
||||
|
||||
# Tool message terms should not be in index
|
||||
cursor = db._conn.execute(
|
||||
"SELECT term FROM term_index WHERE session_id = 's1'"
|
||||
)
|
||||
indexed_terms = [row[0] for row in cursor.fetchall()]
|
||||
for junk in ["output", "exit_code", "0"]:
|
||||
assert junk not in indexed_terms, f"'{junk}' should not be indexed from tool messages"
|
||||
|
||||
|
||||
class TestCJKFallbackInFastSearch:
|
||||
"""CJK queries should fall through to the slow path even when fast=True.
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue