mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: prevent holographic auto-extract from storing raw turns
This commit is contained in:
parent
6fdbf2f2d7
commit
02867489eb
2 changed files with 220 additions and 30 deletions
|
|
@ -30,6 +30,153 @@ from .retrieval import FactRetriever
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Size limits used by the auto-extractor: whole messages longer than the
# source limit are skipped, captured clauses are cut to the clause limit, and
# rendered facts longer than the fact limit are discarded.
_AUTO_EXTRACT_MAX_SOURCE_CHARS = 800
_AUTO_EXTRACT_MAX_CLAUSE_CHARS = 180
_AUTO_EXTRACT_MAX_FACT_CHARS = 260

# Sentence boundary: whitespace that follows ".", "!" or "?", or a run of
# newlines.
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")

# Phrases and imperative verbs that mark text as a request for work rather
# than a statement worth remembering.
_TASK_REQUEST_RE = re.compile(
    r"\b("
    r"can you|could you|would you|please|let'?s|lets|"
    r"look|fix|install|research|build|make|update|run|check|"
    r"tell me|show me|go ahead|keep going|continue"
    r")\b",
    re.IGNORECASE,
)

# Context-dependent words ("this", "here", "today", ...) whose meaning is lost
# once the conversation is over.
_UNSTABLE_REFERENCE_RE = re.compile(
    r"\b(this|that|it|stuff|thing|something|anything|everything|here|there|now|today|tomorrow|yesterday)\b",
    re.IGNORECASE,
)
|
||||
|
||||
|
||||
def _clean_auto_extract_clause(value: str) -> str:
    """Normalize a captured clause before it is placed into a fact.

    Whitespace runs are collapsed to single spaces, surrounding quotes and
    punctuation are trimmed, one leading "that"/"to" connector is dropped,
    and the result is capped at ``_AUTO_EXTRACT_MAX_CLAUSE_CHARS`` characters.

    Falsy input (``None``, ``""``) yields ``""``.
    """
    edge_chars = " \t\r\n\"'`.,;:"
    value = re.sub(r"\s+", " ", str(value or "")).strip(edge_chars)
    value = re.sub(r"^(?:that|to)\s+", "", value, flags=re.IGNORECASE).strip()
    # Trim once more after slicing so a cut that lands on punctuation or a
    # space does not leave it dangling at the end of the clause.
    return value[:_AUTO_EXTRACT_MAX_CLAUSE_CHARS].strip(edge_chars)
|
||||
|
||||
|
||||
def _is_stable_auto_extract_clause(value: str) -> bool:
    """Return ``True`` when a clause is acceptable as part of a stored fact.

    The clause is normalized with ``_clean_auto_extract_clause`` and then
    rejected when it:

    * is shorter than three characters,
    * contains a question mark or a newline,
    * contains a task-request phrase (``_TASK_REQUEST_RE``),
    * contains a second-person pronoun (you / your / yours / yourself), or
    * is nothing but a single unstable reference word such as "this".
    """
    value = _clean_auto_extract_clause(value)
    if len(value) < 3:
        return False
    if "?" in value or "\n" in value:
        return False
    if _TASK_REQUEST_RE.search(value):
        return False
    if re.search(r"\byou(?:r|rs|self)?\b", value, re.IGNORECASE):
        return False
    # fullmatch, not search: a clause that merely contains "it" or "this"
    # is still accepted; only a clause that is entirely such a word is not.
    if _UNSTABLE_REFERENCE_RE.fullmatch(value):
        return False
    return True
|
||||
|
||||
|
||||
def _format_auto_extract_fact(template: str, *parts: str) -> str | None:
    """Render ``template`` with cleaned ``parts`` or return ``None``.

    Args:
        template: A ``str.format`` template with one positional ``{}`` per
            part, e.g. ``"User prefers {}."``.
        *parts: Raw captured clauses; each is cleaned before use.

    Returns:
        The rendered fact with whitespace collapsed and a trailing period, or
        ``None`` when any part fails ``_is_stable_auto_extract_clause`` or the
        rendered fact exceeds ``_AUTO_EXTRACT_MAX_FACT_CHARS`` characters.
    """
    cleaned = [_clean_auto_extract_clause(part) for part in parts]
    if not all(_is_stable_auto_extract_clause(part) for part in cleaned):
        return None
    fact = re.sub(r"\s+", " ", template.format(*cleaned)).strip()
    if not fact.endswith("."):
        fact += "."
    # Over-long facts are dropped rather than truncated.
    if len(fact) > _AUTO_EXTRACT_MAX_FACT_CHARS:
        return None
    return fact
|
||||
|
||||
|
||||
# (pattern, category, renderer) triples tried in order against each sentence.
# Built once at import time instead of on every call. Each renderer returns
# the distilled fact, or None when the captured clause is rejected.
_AUTO_EXTRACT_PATTERNS = (
    (
        re.compile(r"\bI\s+prefer\s+(?P<value>[^.!?\n]+)", re.IGNORECASE),
        "user_pref",
        lambda m: _format_auto_extract_fact("User prefers {}.", m.group("value")),
    ),
    (
        re.compile(r"\bI\s+(?:like|love)\s+(?P<value>[^.!?\n]+)", re.IGNORECASE),
        "user_pref",
        lambda m: _format_auto_extract_fact("User likes {}.", m.group("value")),
    ),
    (
        re.compile(r"\bI\s+use\s+(?P<value>[^.!?\n]+)", re.IGNORECASE),
        "user_pref",
        lambda m: _format_auto_extract_fact("User uses {}.", m.group("value")),
    ),
    (
        re.compile(r"\bI\s+(?:always|usually)\s+(?P<value>[^.!?\n]+)", re.IGNORECASE),
        "user_pref",
        lambda m: _format_auto_extract_fact("User usually {}.", m.group("value")),
    ),
    (
        re.compile(r"\bI\s+never\s+(?P<value>[^.!?\n]+)", re.IGNORECASE),
        "user_pref",
        lambda m: _format_auto_extract_fact("User never {}.", m.group("value")),
    ),
    (
        re.compile(
            r"\bmy\s+(?P<kind>favorite|preferred|default)\s+(?P<thing>[A-Za-z0-9 _/-]{2,60})\s+is\s+(?P<value>[^.!?\n]+)",
            re.IGNORECASE,
        ),
        "user_pref",
        lambda m: _format_auto_extract_fact(
            "User's {} {} is {}.",
            m.group("kind").lower(),
            m.group("thing"),
            m.group("value"),
        ),
    ),
    (
        re.compile(r"\bwe\s+(?:decided|agreed|chose)\s+(?:to\s+)?(?P<value>[^.!?\n]+)", re.IGNORECASE),
        "project",
        lambda m: _format_auto_extract_fact("Project decision: {}.", m.group("value")),
    ),
    (
        re.compile(
            r"\b(?P<subject>the project|hermes|the repo|the codebase)\s+(?P<verb>uses|needs|requires)\s+(?P<value>[^.!?\n]+)",
            re.IGNORECASE,
        ),
        "project",
        lambda m: _format_auto_extract_fact(
            "{} {} {}.",
            m.group("subject").capitalize(),
            m.group("verb").lower(),
            m.group("value"),
        ),
    ),
)


def _auto_extract_facts_from_text(content: str) -> list[tuple[str, str]]:
    """Return distilled ``(fact, category)`` pairs from a user message.

    This extractor is deliberately conservative. It should never store raw user
    turns; it only keeps short normalized facts with stable anchors.

    ``[]`` is returned for non-string input, for messages shorter than 10 or
    longer than ``_AUTO_EXTRACT_MAX_SOURCE_CHARS`` characters (measured after
    whitespace normalization), and for messages that contain a question mark
    or a task-request phrase. Otherwise each sentence of at most 320
    characters contributes at most one fact, and facts are de-duplicated
    case-insensitively within the call.
    """
    if not isinstance(content, str):
        return []
    stripped = content.strip()
    text = re.sub(r"\s+", " ", stripped)
    if len(text) < 10 or len(text) > _AUTO_EXTRACT_MAX_SOURCE_CHARS:
        return []

    # Questions and explicit task requests are usually active-work context, not
    # durable memory. Avoid storing snippets from "can you..." / "please..." turns.
    if "?" in text or _TASK_REQUEST_RE.search(text):
        return []

    extracted: list[tuple[str, str]] = []
    seen: set[str] = set()
    # Split the message *before* collapsing whitespace. Splitting the
    # already-collapsed text can never see a newline, so statements on
    # separate lines without end punctuation would be fused into one clause.
    for raw_sentence in _SENTENCE_SPLIT_RE.split(stripped):
        sentence = re.sub(r"\s+", " ", raw_sentence).strip()
        if not sentence or len(sentence) > 320 or "?" in sentence:
            continue
        for pattern, category, render in _AUTO_EXTRACT_PATTERNS:
            match = pattern.search(sentence)
            if not match:
                continue
            fact = render(match)
            if fact and fact.lower() not in seen:
                seen.add(fact.lower())
                extracted.append((fact, category))
            # NOTE(review): the first matching pattern decides the sentence,
            # even when its clause is rejected. Confirm this is the intended
            # nesting of the break (versus breaking only after a stored fact).
            break
    return extracted
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool schemas (unchanged from original PR)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -356,41 +503,21 @@ class HolographicMemoryProvider(MemoryProvider):
|
|||
# -- Auto-extraction (on_session_end) ------------------------------------
|
||||
|
||||
def _auto_extract_facts(self, messages: list) -> None:
    """Distill facts from the user turns in ``messages`` and store them.

    Only messages whose ``role`` is ``"user"`` and whose ``content`` is a
    string of at least 10 characters are considered. Each one is passed to
    ``_auto_extract_facts_from_text`` and the returned facts are written with
    ``self._store.add_fact``. The raw message text itself is never stored.
    Facts are de-duplicated case-insensitively across the whole list.

    Storage is best-effort: a failing ``add_fact`` is logged at debug level
    and skipped so the remaining facts are still processed.
    """
    extracted = 0
    seen: set[str] = set()
    for msg in messages:
        if msg.get("role") != "user":
            continue
        content = msg.get("content", "")
        if not isinstance(content, str) or len(content) < 10:
            continue

        for fact, category in _auto_extract_facts_from_text(content):
            key = fact.lower()
            if key in seen:
                continue
            seen.add(key)
            try:
                self._store.add_fact(fact, category=category)
            except Exception:
                # Keep going, but leave a trace instead of swallowing silently.
                logger.debug("Auto-extract could not store fact; skipping", exc_info=True)
            else:
                extracted += 1

    if extracted:
        logger.info("Auto-extracted %d facts from conversation", extracted)
|
||||
|
|
|
|||
63
tests/plugins/memory/test_holographic_provider.py
Normal file
63
tests/plugins/memory/test_holographic_provider.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
from plugins.memory.holographic import (
|
||||
HolographicMemoryProvider,
|
||||
_auto_extract_facts_from_text,
|
||||
)
|
||||
|
||||
|
||||
def _provider(tmp_path, **config):
    """Create and initialize a provider whose database lives under ``tmp_path``.

    Keyword arguments override the default test configuration because they
    are merged last into the config dict.
    """
    p = HolographicMemoryProvider(
        config={
            "auto_extract": True,
            "db_path": str(tmp_path / "memory_store.db"),
            "hrr_weight": 0.0,
            **config,
        }
    )
    p.initialize(session_id="test-session")
    return p
|
||||
|
||||
|
||||
def _stored_facts(provider):
    """Return the ``content`` of up to 50 facts held by the provider's store."""
    return [
        row["content"]
        for row in provider._store.list_facts(min_trust=0.0, limit=50)
    ]
|
||||
|
||||
|
||||
def test_auto_extract_distills_preference_instead_of_storing_raw_turn(tmp_path):
    """A stated preference is stored as a distilled fact, not as the raw turn."""
    raw = "I prefer concise direct answers."
    p = _provider(tmp_path)

    p.on_session_end([{"role": "user", "content": raw}])

    facts = _stored_facts(p)
    assert facts == ["User prefers concise direct answers."]
    assert raw not in facts
|
||||
|
||||
|
||||
def test_auto_extract_distills_project_facts(tmp_path):
    """Project decisions and project tooling statements are both distilled."""
    p = _provider(tmp_path)

    p.on_session_end(
        [
            {"role": "user", "content": "We decided to use broker transport for Claude CLI."},
            {"role": "user", "content": "The project uses pytest for regression tests."},
        ]
    )

    assert _stored_facts(p) == [
        "Project decision: use broker transport for Claude CLI.",
        "The project uses pytest for regression tests.",
    ]
|
||||
|
||||
|
||||
def test_auto_extract_rejects_task_requests_and_questions():
    """Turns that are questions or task requests produce no facts at all."""
    raw_turns = [
        "can you install it for me? I want to try using enhanced mode?",
        "what are you talking about? I need to pay for it?",
        "lets do serious stuff here: I want you to fully research the terminal issue",
    ]

    for raw in raw_turns:
        assert _auto_extract_facts_from_text(raw) == []
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue