mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-04-25 00:51:20 +00:00
fix(workspace): address code review findings on parser layer
- Cache the MarkItDown instance across _convert() calls instead of recreating it per file (perf during batch indexing)
- Log a warning when the configured backend name is unknown in build_parser()
- Normalize suffix casing in CompositeParser.can_parse()
- Remove unused import in test file
This commit is contained in:
parent
11618e9928
commit
cf292d258a
2 changed files with 12 additions and 6 deletions
|
|
@@ -1,4 +1,3 @@
|
|||
import subprocess
|
||||
import sys
|
||||
import types
|
||||
from pathlib import Path
|
||||
|
|
|
|||
|
|
@@ -39,14 +39,18 @@ class FileParser(ABC):
|
|||
class MarkitdownParser(FileParser):
|
||||
name = "markitdown"
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._md: object | None = None
|
||||
|
||||
def supported_suffixes(self) -> frozenset[str]:
|
||||
return frozenset({".pdf", ".docx", ".pptx"})
|
||||
|
||||
def _convert(self, path: Path) -> str:
|
||||
from markitdown import MarkItDown
|
||||
if self._md is None:
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown()
|
||||
result = md.convert(str(path))
|
||||
self._md = MarkItDown()
|
||||
result = self._md.convert(str(path)) # type: ignore[union-attr]
|
||||
return result.markdown
|
||||
|
||||
|
||||
|
|
@@ -81,7 +85,7 @@ class CompositeParser:
|
|||
return parser.parse(path)
|
||||
|
||||
def can_parse(self, suffix: str) -> bool:
|
||||
return suffix in self._routing
|
||||
return suffix.lower() in self._routing
|
||||
|
||||
|
||||
_PARSER_CLASSES: list[type[FileParser]] = [MarkitdownParser, PandocParser]
|
||||
|
|
@@ -97,7 +101,10 @@ def build_parser(config: ParsingConfig) -> CompositeParser:
|
|||
for suffix in PARSEABLE_SUFFIXES:
|
||||
backend_name = config.overrides.get(suffix, config.default)
|
||||
parser = available.get(backend_name)
|
||||
if parser is not None and suffix in parser.supported_suffixes():
|
||||
if parser is None:
|
||||
log.warning("Unknown parser backend %r for %s — skipping", backend_name, suffix)
|
||||
continue
|
||||
if suffix in parser.supported_suffixes():
|
||||
routing[suffix] = parser
|
||||
|
||||
return CompositeParser(routing)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue