mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-17 04:31:55 +00:00
feat: reduce term index noise — skip tool messages, filter numerics and JSON keys
Tool-role messages (44% of the corpus) produce junk terms from structured JSON output: field names like 'output', 'exit_code', 'error', 'status' and numeric values like '0', '1', '42'. These terms have zero search value and account for ~77% of index rows.

Changes:
- stop_words.py: add JSON key stop words (output, exit_code, error, null, true, false, status, content, message, cleared, success) and add is_noise_term(), which also filters pure numeric tokens
- term_index.py: switch from is_stop_word to is_noise_term
- hermes_state.py: skip tool-role messages in the insert and reindex paths
- tests: 6 new TestNoiseReduction tests

Impact on live DB: 1.19M -> 278K rows (77% reduction), 1.5s reindex.
This commit is contained in:
parent
410456c599
commit
94f1758742
4 changed files with 169 additions and 12 deletions
|
|
@ -1076,7 +1076,9 @@ class SessionDB:
|
|||
# Insert terms into inverted index (delayed import avoids
|
||||
# circular dependency — hermes_state is imported by nearly
|
||||
# everything at startup, term_index must not be top-level)
|
||||
if content:
|
||||
# Skip tool-role messages: their structured JSON output produces
|
||||
# noise terms (field names, numeric values) with no search value.
|
||||
if content and role != "tool":
|
||||
try:
|
||||
from term_index import extract_terms
|
||||
terms = extract_terms(content)
|
||||
|
|
@ -1605,7 +1607,7 @@ class SessionDB:
|
|||
# Read batch outside write lock
|
||||
with self._lock:
|
||||
cursor = self._conn.execute(
|
||||
"SELECT id, session_id, content FROM messages ORDER BY id LIMIT ? OFFSET ?",
|
||||
"SELECT id, session_id, role, content FROM messages ORDER BY id LIMIT ? OFFSET ?",
|
||||
(batch_size, offset),
|
||||
)
|
||||
rows = cursor.fetchall()
|
||||
|
|
@ -1613,11 +1615,14 @@ class SessionDB:
|
|||
if not rows:
|
||||
break
|
||||
|
||||
# Extract terms for the batch
|
||||
# Extract terms for the batch, skipping tool-role messages
|
||||
entries = []
|
||||
for row in rows:
|
||||
msg_id = row["id"]
|
||||
session_id = row["session_id"]
|
||||
# Skip tool messages — structured JSON output produces noise terms
|
||||
if row["role"] == "tool":
|
||||
continue
|
||||
content = row["content"] or ""
|
||||
terms = extract_terms(content)
|
||||
for term in terms:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue