fix: prevent holographic auto-extract from storing raw turns

2026-04-25 00:51:20 +00:00 · 2026-04-24 23:58:34 +00:00 · 2026-04-24 23:58:34 +00:00 · 02867489eb
commit 02867489eb
parent 6fdbf2f2d7
2 changed files with 220 additions and 30 deletions
--- a/plugins/memory/holographic/__init__.py
+++ b/plugins/memory/holographic/__init__.py
@@ -30,6 +30,153 @@ from .retrieval import FactRetriever
 logger = logging.getLogger(__name__)


+# Size caps, in characters, applied during auto-extraction: the whole user
+# message, one captured clause, and the final rendered fact.
+_AUTO_EXTRACT_MAX_SOURCE_CHARS = 800
+_AUTO_EXTRACT_MAX_CLAUSE_CHARS = 180
+_AUTO_EXTRACT_MAX_FACT_CHARS = 260
+
+# Splits on whitespace that follows ".", "!" or "?", or on a run of newlines.
+_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
+
+# Whole-word phrases that mark text as a request for work.
+_TASK_REQUEST_PHRASES = (
+    r"can you",
+    r"could you",
+    r"would you",
+    r"please",
+    r"let'?s",
+    r"lets",
+    r"look",
+    r"fix",
+    r"install",
+    r"research",
+    r"build",
+    r"make",
+    r"update",
+    r"run",
+    r"check",
+    r"tell me",
+    r"show me",
+    r"go ahead",
+    r"keep going",
+    r"continue",
+)
+_TASK_REQUEST_RE = re.compile(
+    r"\b(" + "|".join(_TASK_REQUEST_PHRASES) + r")\b",
+    re.IGNORECASE,
+)
+
+# Vague or context-bound words (pronouns, placeholders, relative time/place).
+_UNSTABLE_REFERENCE_WORDS = (
+    "this",
+    "that",
+    "it",
+    "stuff",
+    "thing",
+    "something",
+    "anything",
+    "everything",
+    "here",
+    "there",
+    "now",
+    "today",
+    "tomorrow",
+    "yesterday",
+)
+_UNSTABLE_REFERENCE_RE = re.compile(
+    r"\b(" + "|".join(_UNSTABLE_REFERENCE_WORDS) + r")\b",
+    re.IGNORECASE,
+)
+
+
+def _clean_auto_extract_clause(value: str) -> str:
+    """Normalize a captured clause before it is embedded in a fact.
+
+    Whitespace runs collapse to single spaces; surrounding whitespace, quotes,
+    backticks and ``.,;:`` are trimmed; one leading "that"/"to" connector is
+    dropped; and the result is cut to ``_AUTO_EXTRACT_MAX_CLAUSE_CHARS``
+    characters. Falsy input yields an empty string.
+    """
+    edge_chars = " \t\r\n\"'`.,;:"
+    collapsed = re.sub(r"\s+", " ", str(value or ""))
+    trimmed = collapsed.strip(edge_chars)
+    without_lead = re.sub(r"^(?:that|to)\s+", "", trimmed, flags=re.IGNORECASE)
+    capped = without_lead.strip()[:_AUTO_EXTRACT_MAX_CLAUSE_CHARS]
+    # Trim again: the cut can leave punctuation or a space at the new end.
+    return capped.strip(edge_chars)
+
+
+def _is_stable_auto_extract_clause(value: str) -> bool:
+    """Return whether a clause is acceptable inside a stored fact.
+
+    The clause is normalized first. It is rejected when it is shorter than
+    three characters, contains "?" or a newline, contains a task-request
+    phrase, mentions "you"/"your"/"yours"/"yourself", or consists entirely of
+    a single unstable reference word.
+    """
+    clause = _clean_auto_extract_clause(value)
+    disqualified = (
+        len(clause) < 3
+        or "?" in clause
+        or "\n" in clause
+        or _TASK_REQUEST_RE.search(clause) is not None
+        or re.search(r"\byou(?:r|rs|self)?\b", clause, re.IGNORECASE) is not None
+        # fullmatch: only a clause that is nothing but the vague word is refused.
+        or _UNSTABLE_REFERENCE_RE.fullmatch(clause) is not None
+    )
+    return not disqualified
+
+
+def _format_auto_extract_fact(template: str, *parts: str) -> str | None:
+    """Fill ``template`` with cleaned ``parts`` and return the fact.
+
+    Returns ``None`` when any part fails the stability check or when the
+    rendered fact exceeds ``_AUTO_EXTRACT_MAX_FACT_CHARS``. The returned fact
+    has collapsed whitespace and always ends with a period.
+    """
+    clauses = []
+    for part in parts:
+        clause = _clean_auto_extract_clause(part)
+        if not _is_stable_auto_extract_clause(clause):
+            return None
+        clauses.append(clause)
+
+    rendered = re.sub(r"\s+", " ", template.format(*clauses)).strip()
+    if not rendered.endswith("."):
+        rendered = f"{rendered}."
+    return rendered if len(rendered) <= _AUTO_EXTRACT_MAX_FACT_CHARS else None
+
+
+# Longest whitespace-normalized sentence that is still scanned for facts.
+_AUTO_EXTRACT_MAX_SENTENCE_CHARS = 320
+
+# A captured value runs to the end of its sentence. Sentences are split only at
+# terminal punctuation followed by whitespace, so a "." seen inside a sentence
+# belongs to a token ("Python 3.11", "node.js") and must not end the capture.
+_AUTO_EXTRACT_VALUE = r"(?P<value>[^!?\n]+)"
+
+# (pattern, category, render) triples, tried in order; the first pattern that
+# matches a sentence wins. ``render`` turns the match into a fact or ``None``.
+# Built once at import time rather than on every call.
+_AUTO_EXTRACT_PATTERNS = (
+    (
+        re.compile(r"\bI\s+prefer\s+" + _AUTO_EXTRACT_VALUE, re.IGNORECASE),
+        "user_pref",
+        lambda m: _format_auto_extract_fact("User prefers {}.", m.group("value")),
+    ),
+    (
+        re.compile(r"\bI\s+(?:like|love)\s+" + _AUTO_EXTRACT_VALUE, re.IGNORECASE),
+        "user_pref",
+        lambda m: _format_auto_extract_fact("User likes {}.", m.group("value")),
+    ),
+    (
+        re.compile(r"\bI\s+use\s+" + _AUTO_EXTRACT_VALUE, re.IGNORECASE),
+        "user_pref",
+        lambda m: _format_auto_extract_fact("User uses {}.", m.group("value")),
+    ),
+    (
+        re.compile(r"\bI\s+(?:always|usually)\s+" + _AUTO_EXTRACT_VALUE, re.IGNORECASE),
+        "user_pref",
+        lambda m: _format_auto_extract_fact("User usually {}.", m.group("value")),
+    ),
+    (
+        re.compile(r"\bI\s+never\s+" + _AUTO_EXTRACT_VALUE, re.IGNORECASE),
+        "user_pref",
+        lambda m: _format_auto_extract_fact("User never {}.", m.group("value")),
+    ),
+    (
+        re.compile(
+            r"\bmy\s+(?P<kind>favorite|preferred|default)\s+"
+            r"(?P<thing>[A-Za-z0-9 _/-]{2,60})\s+is\s+" + _AUTO_EXTRACT_VALUE,
+            re.IGNORECASE,
+        ),
+        "user_pref",
+        lambda m: _format_auto_extract_fact(
+            "User's {} {} is {}.",
+            m.group("kind").lower(),
+            m.group("thing"),
+            m.group("value"),
+        ),
+    ),
+    (
+        re.compile(
+            r"\bwe\s+(?:decided|agreed|chose)\s+(?:to\s+)?" + _AUTO_EXTRACT_VALUE,
+            re.IGNORECASE,
+        ),
+        "project",
+        lambda m: _format_auto_extract_fact("Project decision: {}.", m.group("value")),
+    ),
+    (
+        re.compile(
+            r"\b(?P<subject>the project|hermes|the repo|the codebase)\s+"
+            r"(?P<verb>uses|needs|requires)\s+" + _AUTO_EXTRACT_VALUE,
+            re.IGNORECASE,
+        ),
+        "project",
+        lambda m: _format_auto_extract_fact(
+            "{} {} {}.",
+            m.group("subject").capitalize(),
+            m.group("verb").lower(),
+            m.group("value"),
+        ),
+    ),
+)
+
+
+def _auto_extract_facts_from_text(content: str) -> list[tuple[str, str]]:
+    """Return distilled ``(fact, category)`` pairs from a user message.
+
+    This extractor is deliberately conservative. It should never store raw user
+    turns; it only keeps short normalized facts with stable anchors.
+
+    Args:
+        content: Raw text of one user message. Non-string input yields ``[]``.
+
+    Returns:
+        Facts in order of appearance, de-duplicated case-insensitively, each
+        paired with its category (``"user_pref"`` or ``"project"``). The list
+        is empty when the whitespace-normalized message is shorter than 10
+        characters, longer than ``_AUTO_EXTRACT_MAX_SOURCE_CHARS``, contains
+        a "?", or contains a task-request phrase.
+    """
+    if not isinstance(content, str):
+        return []
+    text = re.sub(r"\s+", " ", content.strip())
+    if not 10 <= len(text) <= _AUTO_EXTRACT_MAX_SOURCE_CHARS:
+        return []
+
+    # Questions and explicit task requests are usually active-work context, not
+    # durable memory. Avoid storing snippets from "can you..." turns.
+    if "?" in text or _TASK_REQUEST_RE.search(text):
+        return []
+
+    extracted: list[tuple[str, str]] = []
+    seen: set[str] = set()
+    # Split the raw content, not the collapsed text: collapsing first turns
+    # newlines into spaces, which would merge separate unpunctuated lines into
+    # one sentence and therefore into one bogus fact.
+    for raw_sentence in _SENTENCE_SPLIT_RE.split(content):
+        sentence = re.sub(r"\s+", " ", raw_sentence).strip()
+        if not sentence or len(sentence) > _AUTO_EXTRACT_MAX_SENTENCE_CHARS:
+            continue
+        for pattern, category, render in _AUTO_EXTRACT_PATTERNS:
+            match = pattern.search(sentence)
+            if match is None:
+                continue
+            fact = render(match)
+            if fact and fact.lower() not in seen:
+                seen.add(fact.lower())
+                extracted.append((fact, category))
+            # Only the first matching pattern is applied to a sentence, even
+            # when its rendered fact was rejected.
+            break
+    return extracted
+
+
 # ---------------------------------------------------------------------------
 # Tool schemas (unchanged from original PR)
 # ---------------------------------------------------------------------------
@@ -356,41 +503,21 @@ class HolographicMemoryProvider(MemoryProvider):
    # -- Auto-extraction (on_session_end) ------------------------------------

    def _auto_extract_facts(self, messages: list) -> None:
-        _PREF_PATTERNS = [
-            re.compile(r'\bI\s+(?:prefer|like|love|use|want|need)\s+(.+)', re.IGNORECASE),
-            re.compile(r'\bmy\s+(?:favorite|preferred|default)\s+\w+\s+is\s+(.+)', re.IGNORECASE),
-            re.compile(r'\bI\s+(?:always|never|usually)\s+(.+)', re.IGNORECASE),
-        ]
-        _DECISION_PATTERNS = [
-            re.compile(r'\bwe\s+(?:decided|agreed|chose)\s+(?:to\s+)?(.+)', re.IGNORECASE),
-            re.compile(r'\bthe\s+project\s+(?:uses|needs|requires)\s+(.+)', re.IGNORECASE),
-        ]
-
        extracted = 0
+        seen: set[str] = set()
        for msg in messages:
            if msg.get("role") != "user":
                continue
            content = msg.get("content", "")
-            if not isinstance(content, str) or len(content) < 10:
-                continue
-
-            for pattern in _PREF_PATTERNS:
-                if pattern.search(content):
-                    try:
-                        self._store.add_fact(content[:400], category="user_pref")
-                        extracted += 1
-                    except Exception:
-                        pass
-                    break
-
-            for pattern in _DECISION_PATTERNS:
-                if pattern.search(content):
-                    try:
-                        self._store.add_fact(content[:400], category="project")
-                        extracted += 1
-                    except Exception:
-                        pass
-                    break
+            for fact, category in _auto_extract_facts_from_text(content):
+                if fact.lower() in seen:
+                    continue
+                seen.add(fact.lower())
+                try:
+                    self._store.add_fact(fact, category=category)
+                    extracted += 1
+                except Exception:
+                    pass

        if extracted:
            logger.info("Auto-extracted %d facts from conversation", extracted)
--- a/tests/plugins/memory/test_holographic_provider.py
+++ b/tests/plugins/memory/test_holographic_provider.py
@@ -0,0 +1,63 @@
+from plugins.memory.holographic import (
+    HolographicMemoryProvider,
+    _auto_extract_facts_from_text,
+)
+
+
+def _provider(tmp_path, **config):
+    """Build and initialize a provider whose database lives under ``tmp_path``.
+
+    Keyword arguments override the default test configuration.
+    """
+    settings = {
+        "auto_extract": True,
+        "db_path": str(tmp_path / "memory_store.db"),
+        "hrr_weight": 0.0,
+    }
+    settings.update(config)
+    provider = HolographicMemoryProvider(config=settings)
+    provider.initialize(session_id="test-session")
+    return provider
+
+
+def _stored_facts(provider):
+    """Return the ``content`` of stored facts (``min_trust=0.0``, ``limit=50``)."""
+    rows = provider._store.list_facts(min_trust=0.0, limit=50)
+    return [row["content"] for row in rows]
+
+
+def test_auto_extract_distills_preference_instead_of_storing_raw_turn(tmp_path):
+    """A stated preference is stored as a distilled fact, not verbatim."""
+    user_turn = "I prefer concise direct answers."
+    provider = _provider(tmp_path)
+
+    provider.on_session_end([{"role": "user", "content": user_turn}])
+
+    stored = _stored_facts(provider)
+    assert stored == ["User prefers concise direct answers."]
+    assert user_turn not in stored
+
+
+def test_auto_extract_distills_project_facts(tmp_path):
+    """Decision and project-tooling statements become ``project`` facts."""
+    provider = _provider(tmp_path)
+    turns = [
+        {"role": "user", "content": "We decided to use broker transport for Claude CLI."},
+        {"role": "user", "content": "The project uses pytest for regression tests."},
+    ]
+
+    provider.on_session_end(turns)
+
+    expected = [
+        "Project decision: use broker transport for Claude CLI.",
+        "The project uses pytest for regression tests.",
+    ]
+    assert _stored_facts(provider) == expected
+
+
+def test_auto_extract_rejects_task_requests_and_questions():
+    """Questions and work requests must not produce any facts."""
+    rejected_turns = (
+        "can you install it for me? I want to try using enhanced mode?",
+        "what are you talking about? I need to pay for it?",
+        "lets do serious stuff here: I want you to fully research the terminal issue",
+    )
+
+    for turn in rejected_turns:
+        facts = _auto_extract_facts_from_text(turn)
+        assert facts == []
+