mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: handle hyphenated FTS5 queries and preserve quoted literals (#1776)
_sanitize_fts5_query() was stripping ALL double quotes (including properly paired ones), breaking user-provided quoted phrases like "exact phrase". Hyphenated terms like chat-send also silently expanded to chat AND send, returning unexpected or zero results.

Fix:
1. Extract balanced quoted phrases into placeholders before stripping FTS5-special characters, then restore them.
2. Wrap unquoted hyphenated terms (word-word) in double quotes so FTS5 matches them as exact phrases instead of splitting on the hyphen.
3. Unmatched quotes are still stripped as before.

Based on issue report by @bailob (#1770) and PR #1773 by @Jah-yee (whose branch contained unrelated changes and couldn't be merged directly).

Closes #1770
Closes #1773

Co-authored-by: Jah-yee <Jah-yee@users.noreply.github.com>
This commit is contained in:
parent
d5af593769
commit
d7a2e3ddae
2 changed files with 95 additions and 10 deletions
|
|
@ -689,21 +689,45 @@ class SessionDB:
|
||||||
``NOT``) have special meaning. Passing raw user input directly to
|
``NOT``) have special meaning. Passing raw user input directly to
|
||||||
MATCH can cause ``sqlite3.OperationalError``.
|
MATCH can cause ``sqlite3.OperationalError``.
|
||||||
|
|
||||||
Strategy: strip characters that are only meaningful as FTS5 operators
|
Strategy:
|
||||||
and would otherwise cause syntax errors. This preserves normal keyword
|
- Preserve properly paired quoted phrases (``"exact phrase"``)
|
||||||
search while preventing crashes on inputs like ``C++``, ``"unterminated``,
|
- Strip unmatched FTS5-special characters that would cause errors
|
||||||
or ``hello AND``.
|
- Wrap unquoted hyphenated terms in quotes so FTS5 matches them
|
||||||
|
as exact phrases instead of splitting on the hyphen
|
||||||
"""
|
"""
|
||||||
# Remove FTS5-special characters that are not useful in keyword search
|
# Step 1: Extract balanced double-quoted phrases and protect them
|
||||||
sanitized = re.sub(r'[+{}()"^]', " ", query)
|
# from further processing via numbered placeholders.
|
||||||
# Collapse repeated * (e.g. "***") into a single one, and remove
|
_quoted_parts: list = []
|
||||||
# leading * (prefix-only matching requires at least one char before *)
|
|
||||||
|
def _preserve_quoted(m: re.Match) -> str:
|
||||||
|
_quoted_parts.append(m.group(0))
|
||||||
|
return f"\x00Q{len(_quoted_parts) - 1}\x00"
|
||||||
|
|
||||||
|
sanitized = re.sub(r'"[^"]*"', _preserve_quoted, query)
|
||||||
|
|
||||||
|
# Step 2: Strip remaining (unmatched) FTS5-special characters
|
||||||
|
sanitized = re.sub(r'[+{}()\"^]', " ", sanitized)
|
||||||
|
|
||||||
|
# Step 3: Collapse repeated * (e.g. "***") into a single one,
|
||||||
|
# and remove leading * (prefix-only needs at least one char before *)
|
||||||
sanitized = re.sub(r"\*+", "*", sanitized)
|
sanitized = re.sub(r"\*+", "*", sanitized)
|
||||||
sanitized = re.sub(r"(^|\s)\*", r"\1", sanitized)
|
sanitized = re.sub(r"(^|\s)\*", r"\1", sanitized)
|
||||||
# Remove dangling boolean operators at start/end that would cause
|
|
||||||
# syntax errors (e.g. "hello AND" or "OR world")
|
# Step 4: Remove dangling boolean operators at start/end that would
|
||||||
|
# cause syntax errors (e.g. "hello AND" or "OR world")
|
||||||
sanitized = re.sub(r"(?i)^(AND|OR|NOT)\b\s*", "", sanitized.strip())
|
sanitized = re.sub(r"(?i)^(AND|OR|NOT)\b\s*", "", sanitized.strip())
|
||||||
sanitized = re.sub(r"(?i)\s+(AND|OR|NOT)\s*$", "", sanitized.strip())
|
sanitized = re.sub(r"(?i)\s+(AND|OR|NOT)\s*$", "", sanitized.strip())
|
||||||
|
|
||||||
|
# Step 5: Wrap unquoted hyphenated terms (e.g. ``chat-send``) in
|
||||||
|
# double quotes. FTS5's tokenizer splits on hyphens, turning
|
||||||
|
# ``chat-send`` into ``chat AND send``. Quoting preserves the
|
||||||
|
# intended phrase match.
|
||||||
|
sanitized = re.sub(r"\b(\w+(?:-\w+)+)\b", r'"\1"', sanitized)
|
||||||
|
|
||||||
|
# Step 6: Restore preserved quoted phrases
|
||||||
|
for i, quoted in enumerate(_quoted_parts):
|
||||||
|
sanitized = sanitized.replace(f"\x00Q{i}\x00", quoted)
|
||||||
|
|
||||||
return sanitized.strip()
|
return sanitized.strip()
|
||||||
|
|
||||||
def search_messages(
|
def search_messages(
|
||||||
|
|
|
||||||
|
|
@ -261,6 +261,30 @@ class TestFTS5Search:
|
||||||
# The word "C" appears in the content, so FTS5 should find it
|
# The word "C" appears in the content, so FTS5 should find it
|
||||||
assert isinstance(results, list)
|
assert isinstance(results, list)
|
||||||
|
|
||||||
|
def test_search_hyphenated_term_does_not_crash(self, db):
|
||||||
|
"""Hyphenated terms like 'chat-send' must not crash FTS5."""
|
||||||
|
db.create_session(session_id="s1", source="cli")
|
||||||
|
db.append_message("s1", role="user", content="Run the chat-send command")
|
||||||
|
|
||||||
|
results = db.search_messages("chat-send")
|
||||||
|
assert isinstance(results, list)
|
||||||
|
assert len(results) >= 1
|
||||||
|
assert any("chat-send" in (r.get("snippet") or r.get("content", "")).lower()
|
||||||
|
for r in results)
|
||||||
|
|
||||||
|
def test_search_quoted_phrase_preserved(self, db):
|
||||||
|
"""User-provided quoted phrases should be preserved for exact matching."""
|
||||||
|
db.create_session(session_id="s1", source="cli")
|
||||||
|
db.append_message("s1", role="user", content="docker networking is complex")
|
||||||
|
db.append_message("s1", role="assistant", content="networking docker tips")
|
||||||
|
|
||||||
|
# Quoted phrase should match only the exact order
|
||||||
|
results = db.search_messages('"docker networking"')
|
||||||
|
assert isinstance(results, list)
|
||||||
|
# Should find the user message (exact phrase) but may or may not find
|
||||||
|
# the assistant message depending on FTS5 phrase matching
|
||||||
|
assert len(results) >= 1
|
||||||
|
|
||||||
def test_sanitize_fts5_query_strips_dangerous_chars(self):
|
def test_sanitize_fts5_query_strips_dangerous_chars(self):
|
||||||
"""Unit test for _sanitize_fts5_query static method."""
|
"""Unit test for _sanitize_fts5_query static method."""
|
||||||
from hermes_state import SessionDB
|
from hermes_state import SessionDB
|
||||||
|
|
@ -278,6 +302,43 @@ class TestFTS5Search:
|
||||||
# Valid prefix kept
|
# Valid prefix kept
|
||||||
assert s('deploy*') == 'deploy*'
|
assert s('deploy*') == 'deploy*'
|
||||||
|
|
||||||
|
def test_sanitize_fts5_preserves_quoted_phrases(self):
|
||||||
|
"""Properly paired double-quoted phrases should be preserved."""
|
||||||
|
from hermes_state import SessionDB
|
||||||
|
s = SessionDB._sanitize_fts5_query
|
||||||
|
# Simple quoted phrase
|
||||||
|
assert s('"exact phrase"') == '"exact phrase"'
|
||||||
|
# Quoted phrase alongside unquoted terms
|
||||||
|
assert '"docker networking"' in s('"docker networking" setup')
|
||||||
|
# Multiple quoted phrases
|
||||||
|
result = s('"hello world" OR "foo bar"')
|
||||||
|
assert '"hello world"' in result
|
||||||
|
assert '"foo bar"' in result
|
||||||
|
# Unmatched quote still stripped
|
||||||
|
assert '"' not in s('"unterminated')
|
||||||
|
|
||||||
|
def test_sanitize_fts5_quotes_hyphenated_terms(self):
|
||||||
|
"""Hyphenated terms should be wrapped in quotes for exact matching."""
|
||||||
|
from hermes_state import SessionDB
|
||||||
|
s = SessionDB._sanitize_fts5_query
|
||||||
|
# Simple hyphenated term
|
||||||
|
assert s('chat-send') == '"chat-send"'
|
||||||
|
# Multiple hyphens
|
||||||
|
assert s('docker-compose-up') == '"docker-compose-up"'
|
||||||
|
# Hyphenated term with other words
|
||||||
|
result = s('fix chat-send bug')
|
||||||
|
assert '"chat-send"' in result
|
||||||
|
assert 'fix' in result
|
||||||
|
assert 'bug' in result
|
||||||
|
# Multiple hyphenated terms with OR
|
||||||
|
result = s('chat-send OR deploy-prod')
|
||||||
|
assert '"chat-send"' in result
|
||||||
|
assert '"deploy-prod"' in result
|
||||||
|
# Already-quoted hyphenated term — no double quoting
|
||||||
|
assert s('"chat-send"') == '"chat-send"'
|
||||||
|
# Hyphenated inside a quoted phrase stays as-is
|
||||||
|
assert s('"my chat-send thing"') == '"my chat-send thing"'
|
||||||
|
|
||||||
|
|
||||||
# =========================================================================
|
# =========================================================================
|
||||||
# Session search and listing
|
# Session search and listing
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue