diff --git a/plugins/memory/holographic/__init__.py b/plugins/memory/holographic/__init__.py
index cd4ef07b4..217736043 100644
--- a/plugins/memory/holographic/__init__.py
+++ b/plugins/memory/holographic/__init__.py
@@ -30,6 +30,153 @@ from .retrieval import FactRetriever
 
 logger = logging.getLogger(__name__)
 
+_AUTO_EXTRACT_MAX_SOURCE_CHARS = 800
+_AUTO_EXTRACT_MAX_CLAUSE_CHARS = 180
+_AUTO_EXTRACT_MAX_FACT_CHARS = 260
+
+_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+|\n+")
+_TASK_REQUEST_RE = re.compile(
+    r"\b("
+    r"can you|could you|would you|please|let'?s|lets|"
+    r"look|fix|install|research|build|make|update|run|check|"
+    r"tell me|show me|go ahead|keep going|continue"
+    r")\b",
+    re.IGNORECASE,
+)
+_UNSTABLE_REFERENCE_RE = re.compile(
+    r"\b(this|that|it|stuff|thing|something|anything|everything|here|there|now|today|tomorrow|yesterday)\b",
+    re.IGNORECASE,
+)
+
+
+def _clean_auto_extract_clause(value: str) -> str:
+    value = re.sub(r"\s+", " ", str(value or "")).strip(" \t\r\n\"'`.,;:")
+    value = re.sub(r"^(?:that|to)\s+", "", value, flags=re.IGNORECASE).strip()
+    return value[:_AUTO_EXTRACT_MAX_CLAUSE_CHARS].strip(" \t\r\n\"'`.,;:")
+
+
+def _is_stable_auto_extract_clause(value: str) -> bool:
+    value = _clean_auto_extract_clause(value)
+    if len(value) < 3:
+        return False
+    if "?" in value or "\n" in value:
+        return False
+    if _TASK_REQUEST_RE.search(value):
+        return False
+    if re.search(r"\byou(?:r|rs|self)?\b", value, re.IGNORECASE):
+        return False
+    if _UNSTABLE_REFERENCE_RE.fullmatch(value):
+        return False
+    return True
+
+
+def _format_auto_extract_fact(template: str, *parts: str) -> str | None:
+    cleaned = [_clean_auto_extract_clause(part) for part in parts]
+    if not all(_is_stable_auto_extract_clause(part) for part in cleaned):
+        return None
+    fact = template.format(*cleaned)
+    fact = re.sub(r"\s+", " ", fact).strip()
+    if not fact.endswith("."):
+        fact += "."
+    if len(fact) > _AUTO_EXTRACT_MAX_FACT_CHARS:
+        return None
+    return fact
+
+
+def _auto_extract_facts_from_text(content: str) -> list[tuple[str, str]]:
+    """Return distilled ``(fact, category)`` pairs from a user message.
+
+    This extractor is deliberately conservative. It should never store raw user
+    turns; it only keeps short normalized facts with stable anchors.
+    """
+    if not isinstance(content, str):
+        return []
+    text = re.sub(r"\s+", " ", content.strip())
+    if len(text) < 10 or len(text) > _AUTO_EXTRACT_MAX_SOURCE_CHARS:
+        return []
+
+    # Questions and explicit task requests are usually active-work context, not
+    # durable memory. Avoid storing snippets from "can you..." / "I need..." turns.
+    if "?" in text or _TASK_REQUEST_RE.search(text):
+        return []
+
+    patterns: list[tuple[re.Pattern[str], str, Any]] = [
+        (
+            re.compile(r"\bI\s+prefer\s+(?P<value>[^.!?\n]+)", re.IGNORECASE),
+            "user_pref",
+            lambda m: _format_auto_extract_fact("User prefers {}.", m.group("value")),
+        ),
+        (
+            re.compile(r"\bI\s+(?:like|love)\s+(?P<value>[^.!?\n]+)", re.IGNORECASE),
+            "user_pref",
+            lambda m: _format_auto_extract_fact("User likes {}.", m.group("value")),
+        ),
+        (
+            re.compile(r"\bI\s+use\s+(?P<value>[^.!?\n]+)", re.IGNORECASE),
+            "user_pref",
+            lambda m: _format_auto_extract_fact("User uses {}.", m.group("value")),
+        ),
+        (
+            re.compile(r"\bI\s+(?:always|usually)\s+(?P<value>[^.!?\n]+)", re.IGNORECASE),
+            "user_pref",
+            lambda m: _format_auto_extract_fact("User usually {}.", m.group("value")),
+        ),
+        (
+            re.compile(r"\bI\s+never\s+(?P<value>[^.!?\n]+)", re.IGNORECASE),
+            "user_pref",
+            lambda m: _format_auto_extract_fact("User never {}.", m.group("value")),
+        ),
+        (
+            re.compile(
+                r"\bmy\s+(?P<kind>favorite|preferred|default)\s+(?P<thing>[A-Za-z0-9 _/-]{2,60})\s+is\s+(?P<value>[^.!?\n]+)",
+                re.IGNORECASE,
+            ),
+            "user_pref",
+            lambda m: _format_auto_extract_fact(
+                "User's {} {} is {}.",
+                m.group("kind").lower(),
+                m.group("thing"),
+                m.group("value"),
+            ),
+        ),
+        (
+            re.compile(r"\bwe\s+(?:decided|agreed|chose)\s+(?:to\s+)?(?P<value>[^.!?\n]+)", re.IGNORECASE),
+            "project",
+            lambda m: _format_auto_extract_fact("Project decision: {}.", m.group("value")),
+        ),
+        (
+            re.compile(
+                r"\b(?P<subject>the project|hermes|the repo|the codebase)\s+(?P<verb>uses|needs|requires)\s+(?P<value>[^.!?\n]+)",
+                re.IGNORECASE,
+            ),
+            "project",
+            lambda m: _format_auto_extract_fact(
+                "{} {} {}.",
+                m.group("subject").capitalize(),
+                m.group("verb").lower(),
+                m.group("value"),
+            ),
+        ),
+    ]
+
+    extracted: list[tuple[str, str]] = []
+    seen: set[str] = set()
+    for sentence in _SENTENCE_SPLIT_RE.split(text):
+        sentence = sentence.strip()
+        if not sentence or len(sentence) > 320 or "?" in sentence:
+            continue
+        for pattern, category, render in patterns:
+            match = pattern.search(sentence)
+            if not match:
+                continue
+            fact = render(match)
+            if fact and fact.lower() not in seen:
+                seen.add(fact.lower())
+                extracted.append((fact, category))
+            break
+    return extracted
+
+
 # ---------------------------------------------------------------------------
 # Tool schemas (unchanged from original PR)
 # ---------------------------------------------------------------------------
@@ -356,41 +503,21 @@ class HolographicMemoryProvider(MemoryProvider):
     # -- Auto-extraction (on_session_end) ------------------------------------
 
     def _auto_extract_facts(self, messages: list) -> None:
-        _PREF_PATTERNS = [
-            re.compile(r'\bI\s+(?:prefer|like|love|use|want|need)\s+(.+)', re.IGNORECASE),
-            re.compile(r'\bmy\s+(?:favorite|preferred|default)\s+\w+\s+is\s+(.+)', re.IGNORECASE),
-            re.compile(r'\bI\s+(?:always|never|usually)\s+(.+)', re.IGNORECASE),
-        ]
-        _DECISION_PATTERNS = [
-            re.compile(r'\bwe\s+(?:decided|agreed|chose)\s+(?:to\s+)?(.+)', re.IGNORECASE),
-            re.compile(r'\bthe\s+project\s+(?:uses|needs|requires)\s+(.+)', re.IGNORECASE),
-        ]
-
         extracted = 0
+        seen: set[str] = set()
         for msg in messages:
             if msg.get("role") != "user":
                 continue
             content = msg.get("content", "")
-            if not isinstance(content, str) or len(content) < 10:
-                continue
-
-            for pattern in _PREF_PATTERNS:
-                if pattern.search(content):
-                    try:
-                        self._store.add_fact(content[:400], category="user_pref")
-                        extracted += 1
-                    except Exception:
-                        pass
-                    break
-
-            for pattern in _DECISION_PATTERNS:
-                if pattern.search(content):
-                    try:
-                        self._store.add_fact(content[:400], category="project")
-                        extracted += 1
-                    except Exception:
-                        pass
-                    break
+            for fact, category in _auto_extract_facts_from_text(content):
+                if fact.lower() in seen:
+                    continue
+                seen.add(fact.lower())
+                try:
+                    self._store.add_fact(fact, category=category)
+                    extracted += 1
+                except Exception:
+                    pass
 
         if extracted:
             logger.info("Auto-extracted %d facts from conversation", extracted)
diff --git a/tests/plugins/memory/test_holographic_provider.py b/tests/plugins/memory/test_holographic_provider.py
new file mode 100644
index 000000000..5d334b5c4
--- /dev/null
+++ b/tests/plugins/memory/test_holographic_provider.py
@@ -0,0 +1,63 @@
+from plugins.memory.holographic import (
+    HolographicMemoryProvider,
+    _auto_extract_facts_from_text,
+)
+
+
+def _provider(tmp_path, **config):
+    p = HolographicMemoryProvider(
+        config={
+            "auto_extract": True,
+            "db_path": str(tmp_path / "memory_store.db"),
+            "hrr_weight": 0.0,
+            **config,
+        }
+    )
+    p.initialize(session_id="test-session")
+    return p
+
+
+def _stored_facts(provider):
+    return [
+        row["content"]
+        for row in provider._store.list_facts(min_trust=0.0, limit=50)
+    ]
+
+
+def test_auto_extract_distills_preference_instead_of_storing_raw_turn(tmp_path):
+    raw = "I prefer concise direct answers."
+    p = _provider(tmp_path)
+
+    p.on_session_end([{"role": "user", "content": raw}])
+
+    facts = _stored_facts(p)
+    assert facts == ["User prefers concise direct answers."]
+    assert raw not in facts
+
+
+def test_auto_extract_distills_project_facts(tmp_path):
+    p = _provider(tmp_path)
+
+    p.on_session_end(
+        [
+            {"role": "user", "content": "We decided to use broker transport for Claude CLI."},
+            {"role": "user", "content": "The project uses pytest for regression tests."},
+        ]
+    )
+
+    assert _stored_facts(p) == [
+        "Project decision: use broker transport for Claude CLI.",
+        "The project uses pytest for regression tests.",
+    ]
+
+
+def test_auto_extract_rejects_task_requests_and_questions():
+    raw_turns = [
+        "can you install it for me? I want to try using enhanced mode?",
+        "what are you talking about? I need to pay for it?",
+        "lets do serious stuff here: I want you to fully research the terminal issue",
+    ]
+
+    for raw in raw_turns:
+        assert _auto_extract_facts_from_text(raw) == []
+