fix(workspace): address code review findings on parser layer

- Cache MarkItDown instance across _convert() calls instead of
  recreating per file (perf during batch indexing)
- Log warning when configured backend name is unknown in build_parser()
- Normalize suffix casing in CompositeParser.can_parse()
- Remove unused import in test file
This commit is contained in:
alt-glitch 2026-04-20 03:58:08 +05:30
parent 11618e9928
commit cf292d258a
2 changed files with 12 additions and 6 deletions

View file

@@ -1,4 +1,3 @@
import subprocess
import sys
import types
from pathlib import Path

View file

@@ -39,14 +39,18 @@ class FileParser(ABC):
class MarkitdownParser(FileParser):
name = "markitdown"
def __init__(self) -> None:
self._md: object | None = None
def supported_suffixes(self) -> frozenset[str]:
return frozenset({".pdf", ".docx", ".pptx"})
def _convert(self, path: Path) -> str:
from markitdown import MarkItDown
if self._md is None:
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert(str(path))
self._md = MarkItDown()
result = self._md.convert(str(path)) # type: ignore[union-attr]
return result.markdown
@@ -81,7 +85,7 @@ class CompositeParser:
return parser.parse(path)
def can_parse(self, suffix: str) -> bool:
return suffix in self._routing
return suffix.lower() in self._routing
_PARSER_CLASSES: list[type[FileParser]] = [MarkitdownParser, PandocParser]
@@ -97,7 +101,10 @@ def build_parser(config: ParsingConfig) -> CompositeParser:
for suffix in PARSEABLE_SUFFIXES:
backend_name = config.overrides.get(suffix, config.default)
parser = available.get(backend_name)
if parser is not None and suffix in parser.supported_suffixes():
if parser is None:
log.warning("Unknown parser backend %r for %s — skipping", backend_name, suffix)
continue
if suffix in parser.supported_suffixes():
routing[suffix] = parser
return CompositeParser(routing)