mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
feat: reduce term index noise — skip tool messages, filter numerics and JSON keys
Tool-role messages (44% of the corpus) produce junk terms from structured JSON output: field names like 'output', 'exit_code', 'error', 'status' and numeric values like '0', '1', '42'. These terms have zero search value and account for ~77% of index rows.

Changes:
- stop_words.py: add JSON key stop words (output, exit_code, error, null, true, false, status, content, message, cleared, success) and is_noise_term(), which also filters pure numeric tokens
- term_index.py: switch from is_stop_word to is_noise_term
- hermes_state.py: skip tool-role messages at the insert and reindex paths
- tests: 6 new TestNoiseReduction tests

Impact on live DB: 1.19M -> 278K rows (77% reduction), 1.5s reindex.
This commit is contained in:
parent
410456c599
commit
94f1758742
4 changed files with 169 additions and 12 deletions
|
|
@@ -3,10 +3,15 @@
|
|||
Extracts non-stop-word terms from message content for insertion into the
|
||||
term_index table in SessionDB. Terms are lowercased, punctuation-stripped
|
||||
(with preservation of path-like strings), and deduplicated per message.
|
||||
|
||||
Noise filtering:
|
||||
- English stop words (NLTK list)
|
||||
- JSON schema keys from tool output (output, exit_code, error, etc.)
|
||||
- Pure numeric tokens (0, 1, 42, etc.)
|
||||
"""
|
||||
|
||||
import re
|
||||
from stop_words import is_stop_word
|
||||
from stop_words import is_noise_term
|
||||
|
||||
# Matches "words" including paths (foo/bar), filenames (file.py), and
|
||||
# hyphenated terms (self-hosted). Filters out most punctuation but
|
||||
|
|
@@ -16,10 +21,10 @@ _TERM_RE = re.compile(r"[a-zA-Z0-9][\w./\-]*[a-zA-Z0-9]|[a-zA-Z0-9]")
|
|||
|
||||
|
||||
def extract_terms(content: str) -> list[str]:
|
||||
"""Extract non-stop-word terms from message content.
|
||||
"""Extract non-noise terms from message content.
|
||||
|
||||
Returns a deduplicated, lowercased list of terms.
|
||||
Stops words, pure punctuation, and empty strings are excluded.
|
||||
Stop words, JSON keys, pure numerics, and empty strings are excluded.
|
||||
"""
|
||||
if not content:
|
||||
return []
|
||||
|
|
@@ -31,11 +36,9 @@ def extract_terms(content: str) -> list[str]:
|
|||
terms = []
|
||||
for token in raw_tokens:
|
||||
lower = token.lower()
|
||||
# Skip stop words
|
||||
if is_stop_word(lower):
|
||||
# Skip noise: stop words, JSON keys, pure numerics
|
||||
if is_noise_term(lower):
|
||||
continue
|
||||
# Skip single characters except meaningful ones
|
||||
# (but these are already handled by stop words for 'a', 'I', etc.)
|
||||
# Deduplicate within this message
|
||||
if lower not in seen:
|
||||
seen.add(lower)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue