mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-24 05:41:40 +00:00
feat: reduce term index noise — skip tool messages, filter numerics and JSON keys
Tool-role messages (44% of the corpus) produce junk terms from structured JSON output: field names like 'output', 'exit_code', 'error', 'status' and numeric values like '0', '1', '42'. These have zero search value and account for ~77% of index rows. Changes: (1) stop_words.py: add JSON key stop words (output, exit_code, error, null, true, false, status, content, message, cleared, success) and is_noise_term(), which also filters pure numeric tokens; (2) term_index.py: switch from is_stop_word to is_noise_term; (3) hermes_state.py: skip tool-role messages at the insert and reindex paths; (4) tests: 6 new TestNoiseReduction tests. Impact on the live DB: 1.19M -> 278K rows (77% reduction), 1.5s reindex.
This commit is contained in:
parent
410456c599
commit
94f1758742
4 changed files with 169 additions and 12 deletions
|
|
@ -1,14 +1,18 @@
|
|||
"""Stop word list for term index extraction.
|
||||
|
||||
Uses the well-known NLTK English stop word list (179 words) as a baseline.
|
||||
Uses the well-known NLTK English stop word list (179 words) as a baseline,
|
||||
plus common JSON schema keys from tool output and pure-numeric filter.
|
||||
|
||||
This module is self-contained -- no external dependencies.
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
# Standard English stop words (NLTK list, public domain)
|
||||
# Covers articles, conjunctions, prepositions, pronouns, auxiliary verbs,
|
||||
# and common function words. Intentionally excludes short tech terms
|
||||
# that overlap (e.g., "go", "it" as in IT/InfoTech handled by context).
|
||||
_STOP_WORDS = frozenset(
|
||||
_ENGLISH_STOP_WORDS = frozenset(
|
||||
w.lower() for w in [
|
||||
"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
|
||||
"your", "yours", "yourself", "yourselves", "he", "him", "his",
|
||||
|
|
@ -31,12 +35,45 @@ _STOP_WORDS = frozenset(
|
|||
]
|
||||
)
|
||||
|
||||
# JSON schema keys that appear constantly in tool output.
|
||||
# These are field names from structured tool responses, not semantic content.
|
||||
# Nobody searches for "exit_code" to find a past session.
|
||||
_JSON_KEY_STOP_WORDS = frozenset([
|
||||
"output",
|
||||
"exit_code",
|
||||
"error",
|
||||
"null",
|
||||
"true",
|
||||
"false",
|
||||
"status",
|
||||
"content",
|
||||
"message",
|
||||
"cleared",
|
||||
"success",
|
||||
])
|
||||
|
||||
# Every stop word the index ignores: the English baseline merged with the
# JSON tool-output keys.
_STOP_WORDS = _ENGLISH_STOP_WORDS.union(_JSON_KEY_STOP_WORDS)
|
||||
|
||||
# Pattern to detect pure numeric tokens: one or more ASCII digits and nothing
# else. Only unsigned integers match -- floats such as "3.14", hex such as
# "0x1f", and signed values such as "-1" do not. Note that "$" also matches
# just before a single trailing newline, so "42\n" is accepted as well.
_NUMERIC_RE = re.compile(r"^[0-9]+$")
|
||||
|
||||
|
||||
def is_stop_word(word: str) -> bool:
    """Return True when *word* is in the stop word set.

    The lookup is case-insensitive: *word* is lowercased before the
    membership test.
    """
    normalized = word.lower()
    return normalized in _STOP_WORDS
|
||||
|
||||
|
||||
def is_noise_term(word: str) -> bool:
    """Return True when *word* should be kept out of the term index.

    A term counts as noise if, after lowercasing, it is either a stop word
    or a purely numeric token (as recognised by ``_NUMERIC_RE``). Neither
    kind helps anyone locate a past session.
    """
    token = word.lower()
    if token in _STOP_WORDS:
        return True
    # A successful match yields a (always truthy) match object; a failed one
    # yields None.
    return bool(_NUMERIC_RE.match(token))
|
||||
|
||||
|
||||
def get_stop_words() -> frozenset:
    """Expose the combined stop word set.

    Returns the module-level frozenset itself (not a copy), intended for
    inspection or bulk membership checks.
    """
    return _STOP_WORDS
|
||||
Loading…
Add table
Add a link
Reference in a new issue