fix(curator): prevent false-positive consolidation from substring matching

_classify_removed_skills used naive 'in' substring matching to detect
whether a removed skill's name appeared in skill_manage arguments.
Short/common skill names (api, git, test, foo, etc.) matched
incorrectly when they appeared as substrings of longer words in file
paths (references/api-design.md) or content (latest, testing).

Replace with field-aware matching:
- file_path: needle must match a complete filename stem or directory
  name, with -/_ normalised for variant tolerance
- content fields: word-boundary regex (\b) prevents embedding in
  longer words

Also add 3 regression tests covering the false-positive scenarios.
This commit is contained in:
daixin1204 2026-05-03 21:02:03 +08:00 committed by Teknium
parent c0300575c1
commit 744079ffe6
2 changed files with 112 additions and 4 deletions

View file

@ -24,6 +24,7 @@ from __future__ import annotations
import json
import logging
import os
import re
import tempfile
import threading
from datetime import datetime, timedelta, timezone
@ -469,6 +470,24 @@ def _reports_root() -> Path:
return root
def _needle_in_path_component(needle: str, path: str) -> bool:
"""Check if *needle* is a complete filename stem or directory name in *path*.
Unlike simple substring matching, this avoids false positives where short
skill names are embedded in longer filenames (e.g. "api" matching
"references/api-design.md"). Hyphens and underscores are normalised so
"open-webui-setup" matches "open_webui_setup.md".
"""
norm_needle = needle.replace("-", "_")
for part in path.replace("\\", "/").split("/"):
if not part:
continue
stem = part.rsplit(".", 1)[0] if "." in part else part
if stem.replace("-", "_") == norm_needle:
return True
return False
def _classify_removed_skills(
removed: List[str],
added: List[str],
@ -547,15 +566,29 @@ def _classify_removed_skills(
continue
# Look for the removed skill's name in file_path / content / raw.
haystacks: List[str] = []
# Matching strategy differs by field type:
# file_path — needle must be a complete path component
# (filename stem or directory name), so "api" does NOT
# falsely match "references/api-design.md".
# content fields — word-boundary regex so "test" does NOT
# falsely match "latest" or "testing".
haystacks: List[tuple[str, str]] = []
for key in ("file_path", "file_content", "content", "new_string", "_raw"):
v = args.get(key)
if isinstance(v, str):
haystacks.append(v)
haystacks.append((key, v))
hit = False
for hay in haystacks:
for key, hay in haystacks:
for needle in needles:
if needle and needle in hay:
if not needle:
continue
if key == "file_path":
matched = _needle_in_path_component(needle, hay)
else:
matched = bool(
re.search(rf'\b{re.escape(needle)}\b', hay)
)
if matched:
hit = True
evidence = (
f"skill_manage action={args.get('action', '?')} "

View file

@ -220,6 +220,81 @@ def test_classify_handles_malformed_arguments_string(curator_env):
assert len(result["pruned"]) == 1
def test_classify_no_false_positive_short_name_in_file_path(curator_env):
"""Short skill name that is a substring of another filename = pruned, not consolidated."""
# e.g. "api" should NOT match "references/api-design.md"
result = curator_env._classify_removed_skills(
removed=["api"],
added=[],
after_names={"conventions"},
tool_calls=[
{
"name": "skill_manage",
"arguments": json.dumps({
"action": "write_file",
"name": "conventions",
"file_path": "references/api-design.md",
"file_content": "# API Design\n...",
}),
},
],
)
assert result["consolidated"] == [], (
f"Short name 'api' should NOT match file_path 'references/api-design.md'"
)
assert len(result["pruned"]) == 1
assert result["pruned"][0]["name"] == "api"
def test_classify_no_false_positive_short_name_in_content(curator_env):
"""Short skill name embedded in longer word in content = pruned, not consolidated."""
# e.g. "test" should NOT match content "running latest tests"
result = curator_env._classify_removed_skills(
removed=["test"],
added=[],
after_names={"umbrella"},
tool_calls=[
{
"name": "skill_manage",
"arguments": json.dumps({
"action": "patch",
"name": "umbrella",
"old_string": "old",
"new_string": "running latest tests with pytest",
}),
},
],
)
assert result["consolidated"] == [], (
f"Short name 'test' should NOT match 'latest' via word boundary"
)
assert len(result["pruned"]) == 1
def test_classify_still_matches_exact_word_in_content(curator_env):
"""Word-boundary match still works for exact word occurrences."""
# "api" SHOULD match content "use the api gateway"
result = curator_env._classify_removed_skills(
removed=["api"],
added=[],
after_names={"gateway"},
tool_calls=[
{
"name": "skill_manage",
"arguments": json.dumps({
"action": "edit",
"name": "gateway",
"content": "# Gateway\n\nUse the api gateway for all requests.\n",
}),
},
],
)
assert len(result["consolidated"]) == 1, (
f"'api' should match as a standalone word in content"
)
assert result["consolidated"][0]["into"] == "gateway"
def test_report_md_splits_consolidated_and_pruned_sections(curator_env):
"""End-to-end: REPORT.md shows both sections distinctly."""
curator = curator_env