mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix: merge dotted+hyphenated FTS5 quoting into single pass
The original PR applied dotted and hyphenated regex quoting in two sequential steps. For terms with both dots and hyphens (e.g. my-app.config.ts), step 2 would re-match inside already-quoted output, producing malformed double-quoted FTS5 syntax. Merged into a single regex pass: \w+(?:[.-]\w+)+ — handles dots, hyphens, and mixed terms in one shot. Added test coverage for the mixed case.
This commit is contained in:
parent
9825cd7b1e
commit
e4db72ef39
2 changed files with 14 additions and 13 deletions
|
|
@ -1009,8 +1009,9 @@ class SessionDB:
|
|||
Strategy:
|
||||
- Preserve properly paired quoted phrases (``"exact phrase"``)
|
||||
- Strip unmatched FTS5-special characters that would cause errors
|
||||
- Wrap unquoted hyphenated terms in quotes so FTS5 matches them
|
||||
as exact phrases instead of splitting on the hyphen
|
||||
- Wrap unquoted hyphenated and dotted terms in quotes so FTS5
|
||||
matches them as exact phrases instead of splitting on the
|
||||
hyphen/dot (e.g. ``chat-send``, ``P2.2``, ``my-app.config.ts``)
|
||||
"""
|
||||
# Step 1: Extract balanced double-quoted phrases and protect them
|
||||
# from further processing via numbered placeholders.
|
||||
|
|
@ -1035,17 +1036,13 @@ class SessionDB:
|
|||
sanitized = re.sub(r"(?i)^(AND|OR|NOT)\b\s*", "", sanitized.strip())
|
||||
sanitized = re.sub(r"(?i)\s+(AND|OR|NOT)\s*$", "", sanitized.strip())
|
||||
|
||||
# Step 5: Wrap unquoted dotted terms (e.g. ``P2.2``, ``simulate.p2.test.ts``)
|
||||
# in double quotes. In practice, FTS5 query parsing can treat dots as
|
||||
# syntax boundaries, which may produce OperationalError or zero results.
|
||||
# Quoting forces phrase semantics and avoids query parse edge cases.
|
||||
sanitized = re.sub(r"\b([\w-]+(?:\.[\w-]+)+)\b", r'"\1"', sanitized)
|
||||
|
||||
# Step 6: Wrap unquoted hyphenated terms (e.g. ``chat-send``) in
|
||||
# double quotes. FTS5's tokenizer splits on hyphens, turning
|
||||
# ``chat-send`` into ``chat AND send``. Quoting preserves the
|
||||
# intended phrase match.
|
||||
sanitized = re.sub(r"\b(\w+(?:-\w+)+)\b", r'"\1"', sanitized)
|
||||
# Step 5: Wrap unquoted dotted and/or hyphenated terms in double
|
||||
# quotes. FTS5's tokenizer splits on dots and hyphens, turning
|
||||
# ``chat-send`` into ``chat AND send`` and ``P2.2`` into ``p2 AND 2``.
|
||||
# Quoting preserves phrase semantics. A single pass avoids the
|
||||
# double-quoting bug that would occur if dotted and hyphenated
|
||||
# patterns were applied sequentially (e.g. ``my-app.config``).
|
||||
sanitized = re.sub(r"\b(\w+(?:[.-]\w+)+)\b", r'"\1"', sanitized)
|
||||
|
||||
# Step 6: Restore preserved quoted phrases
|
||||
for i, quoted in enumerate(_quoted_parts):
|
||||
|
|
|
|||
|
|
@ -474,6 +474,10 @@ class TestFTS5Search:
|
|||
assert '"P2.2"' in result
|
||||
assert '"simulate.p2"' in result
|
||||
|
||||
# Mixed dots and hyphens — single pass avoids double-quoting
|
||||
assert s('my-app.config') == '"my-app.config"'
|
||||
assert s('my-app.config.ts') == '"my-app.config.ts"'
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Session search and listing
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue