mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-15 09:21:36 +00:00
Add stdlib-only extraction for `.ipynb`, `.docx`, and `.xlsx` in read_file with lazy integration and malformed-document fallback.
248 lines
8.3 KiB
Python
248 lines
8.3 KiB
Python
"""Stdlib document-to-text extraction for ``read_file``.
|
|
|
|
Supports Jupyter notebooks, DOCX, and XLSX without adding hard dependencies.
|
|
Malformed documents raise :class:`ExtractionError`; callers can then fall back to
|
|
normal text/binary handling.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import posixpath
|
|
import zipfile
|
|
from pathlib import Path
|
|
from xml.etree import ElementTree as ET
|
|
|
|
# Names exported by a star-import of this module; everything else is internal.
__all__ = ["EXTRACTABLE_EXTENSIONS", "ExtractionError", "extract_document_text", "is_extractable_document"]
|
|
|
|
# Lower-case file suffixes (leading dot included) that this module can render as text.
EXTRACTABLE_EXTENSIONS = frozenset((".ipynb", ".docx", ".xlsx"))

# Byte ceiling for XLSX input (50 MiB).
# NOTE(review): not referenced by the code visible here; presumably enforced by the caller.
MAX_XLSX_BYTES = 50 * 1024**2

# Per-sheet output limits: rows beyond the first 5000 and columns at index 256+ are dropped.
_MAX_XLSX_ROWS_PER_SHEET = 5_000
_MAX_XLSX_COLS = 256

# XML namespace URIs used when matching OOXML element and attribute names.
_NS_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_NS_S = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
_NS_REL = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
_NS_PKG_REL = "http://schemas.openxmlformats.org/package/2006/relationships"
|
|
|
|
|
|
class ExtractionError(Exception):
    """Signal that a file with a supported extension could not be turned into text.

    The extractors in this module raise it for unreadable, malformed, or
    empty documents so that a caller can catch one exception type.
    """
|
|
|
|
|
|
def _extension(path: str) -> str:
    """Return the lower-cased suffix of *path* if it is extractable, else ``""``."""
    suffix = Path(path).suffix.lower()
    if suffix not in EXTRACTABLE_EXTENSIONS:
        return ""
    return suffix
|
|
|
|
|
|
def is_extractable_document(path: str) -> bool:
    """Tell whether *path* ends in one of the supported document suffixes.

    Only the file name is inspected; the file itself is never opened.
    """
    return _extension(path) != ""
|
|
|
|
|
|
def extract_document_text(path: str) -> str:
    """Convert the document at *path* to plain text, choosing the extractor by suffix.

    Raises:
        ExtractionError: if the suffix is not supported, or if the selected
            extractor rejects the file.
    """
    extractors = {
        ".ipynb": _extract_notebook,
        ".docx": _extract_docx,
        ".xlsx": _extract_xlsx,
    }
    extractor = extractors.get(_extension(path))
    if extractor is None:
        raise ExtractionError(f"Unsupported document type: {path!r}")
    return extractor(path)
|
|
|
|
|
|
def _source_text(source) -> str:
|
|
if isinstance(source, str):
|
|
return source
|
|
if isinstance(source, list):
|
|
return "".join(item for item in source if isinstance(item, str))
|
|
return ""
|
|
|
|
|
|
def _extract_notebook(path: str) -> str:
    """Render a Jupyter notebook as plain text, one labelled section per cell.

    Markdown, code and raw cells are emitted in file order, each preceded by
    a ``# ── <Kind> cell <n> ──`` header (raw cells are not numbered). Cell
    outputs are not included. Cells of any other type, and entries that are
    not JSON objects, are skipped.

    Both the current top-level ``cells`` layout and the legacy ``worksheets``
    layout are accepted. Legacy code cells keep their text under ``input``
    rather than ``source``, so that key is used as a fallback.

    Raises:
        ExtractionError: if the file cannot be read or parsed as JSON, the
            root is not an object, or no readable cell is found.
    """
    try:
        with open(path, encoding="utf-8", errors="replace") as fh:
            nb = json.load(fh)
    # json.JSONDecodeError is a ValueError; RecursionError covers pathologically
    # nested input, which is just another kind of malformed document.
    except (OSError, ValueError, RecursionError) as exc:
        raise ExtractionError(f"Not a valid notebook: {exc}") from exc
    if not isinstance(nb, dict):
        raise ExtractionError("Notebook root is not an object")

    cells = nb.get("cells")
    if not isinstance(cells, list):
        # Legacy layout: cells live inside a list of worksheets. Validate every
        # level so a malformed file ends in ExtractionError, not TypeError.
        cells = []
        worksheets = nb.get("worksheets")
        if isinstance(worksheets, list):
            for ws in worksheets:
                ws_cells = ws.get("cells") if isinstance(ws, dict) else None
                if isinstance(ws_cells, list):
                    cells.extend(ws_cells)
    if not cells:
        raise ExtractionError("Notebook contains no cells")

    counts = {"markdown": 0, "code": 0, "raw": 0}
    labels = {"markdown": "Markdown", "code": "Code", "raw": "Raw"}
    out: list[str] = []
    for cell in cells:
        if not isinstance(cell, dict):
            continue
        typ = cell.get("cell_type")
        if typ not in labels:
            continue
        counts[typ] += 1
        suffix = f" {counts[typ]}" if typ != "raw" else ""
        source = cell.get("source")
        if source is None and typ == "code":
            # Legacy code cells store their text under "input".
            source = cell.get("input")
        out.extend((f"# ── {labels[typ]} cell{suffix} ──", _source_text(source).rstrip("\n"), ""))
    if not out:
        raise ExtractionError("Notebook contains no readable cells")
    return "\n".join(out).rstrip("\n") + "\n"
|
|
|
|
|
|
def _zip_xml(zf: zipfile.ZipFile, name: str) -> ET.Element:
|
|
try:
|
|
return ET.fromstring(zf.read(name))
|
|
except KeyError as exc:
|
|
raise ExtractionError(f"Missing {name}") from exc
|
|
except ET.ParseError as exc:
|
|
raise ExtractionError(f"Malformed XML in {name}: {exc}") from exc
|
|
|
|
|
|
def _extract_docx(path: str) -> str:
    """Extract the text of ``word/document.xml`` from a DOCX file.

    Each ``w:p`` paragraph becomes one or more output lines. ``w:t`` runs
    contribute their text, ``w:tab`` a tab character, and ``w:br`` / ``w:cr``
    a line break. All other elements contribute nothing themselves.

    A paragraph nested inside another paragraph (for example text-box
    content) is rendered once, as its own paragraph; its text is not also
    folded into the enclosing paragraph.

    Raises:
        ExtractionError: if the file is not a readable ZIP archive, the
            document part is missing or malformed, or no non-blank text is
            found.
    """
    # Local import: only needed to recognise corrupt deflate streams below.
    import zlib

    try:
        with zipfile.ZipFile(path) as zf:
            root = _zip_xml(zf, "word/document.xml")
    except zipfile.BadZipFile as exc:
        raise ExtractionError(f"Not a valid DOCX: {exc}") from exc
    except (RuntimeError, zlib.error, EOFError) as exc:
        # Encrypted members, unsupported compression methods and corrupt or
        # truncated streams are malformed documents too; keep the contract
        # that callers only need to catch ExtractionError.
        raise ExtractionError(f"Unreadable DOCX: {exc}") from exc
    except OSError as exc:
        raise ExtractionError(str(exc)) from exc

    w = f"{{{_NS_W}}}"
    p_tag, t_tag, tab_tag = f"{w}p", f"{w}t", f"{w}tab"
    break_tags = {f"{w}br", f"{w}cr"}

    lines: list[str] = []
    for para in root.iter(p_tag):
        buf: list[str] = []
        # Depth-first walk in document order using an explicit stack.
        stack = list(reversed(list(para)))
        while stack:
            node = stack.pop()
            tag = node.tag
            if tag == p_tag:
                # root.iter() visits nested paragraphs separately; descending
                # here would emit their text twice.
                continue
            if tag == t_tag:
                buf.append(node.text or "")
            elif tag == tab_tag:
                buf.append("\t")
            elif tag in break_tags:
                buf.append("\n")
            stack.extend(reversed(list(node)))
        lines.extend("".join(buf).split("\n"))
    if not any(line.strip() for line in lines):
        raise ExtractionError("DOCX contains no extractable text")
    return "\n".join(lines).rstrip("\n") + "\n"
|
|
|
|
|
|
def _extract_xlsx(path: str) -> str:
    """Render the visible worksheets of an XLSX workbook as tab-separated text.

    Sheets are emitted in workbook order, each under a ``# ── Sheet: <name> ──``
    header followed by one tab-joined line per row, or ``(empty)`` when the
    sheet has no rows. A sheet is skipped when it is marked ``hidden`` or
    ``veryHidden``, when its relationship does not resolve to a member of the
    archive, or when its XML cannot be parsed.

    Raises:
        ExtractionError: if the file is not a readable ZIP archive, the
            workbook part is missing or malformed, or no sheet is emitted.
    """
    # Local import: only needed to recognise corrupt deflate streams below.
    import zlib

    try:
        with zipfile.ZipFile(path) as zf:
            names = set(zf.namelist())
            shared = _shared_strings(zf, names)
            sheets = _workbook_sheets(zf)
            rels = _workbook_rels(zf, names)
            out: list[str] = []
            for name, state, rid in sheets:
                if state in {"hidden", "veryHidden"}:
                    continue
                part = _sheet_part(rels.get(rid, ""))
                if part not in names:
                    continue
                try:
                    rows = _sheet_rows(zf.read(part), shared)
                except ET.ParseError:
                    # One unparsable sheet should not hide the others.
                    continue
                out.append(f"# ── Sheet: {name} ──")
                out.extend("\t".join(row) for row in rows)
                if not rows:
                    out.append("(empty)")
                out.append("")
    except zipfile.BadZipFile as exc:
        raise ExtractionError(f"Not a valid XLSX: {exc}") from exc
    except (RuntimeError, zlib.error, EOFError) as exc:
        # Encrypted members, unsupported compression methods and corrupt or
        # truncated streams are malformed documents too; keep the contract
        # that callers only need to catch ExtractionError.
        raise ExtractionError(f"Unreadable XLSX: {exc}") from exc
    except OSError as exc:
        raise ExtractionError(str(exc)) from exc

    if not out:
        raise ExtractionError("XLSX has no visible sheets with content")
    return "\n".join(out).rstrip("\n") + "\n"
|
|
|
|
|
|
def _shared_strings(zf: zipfile.ZipFile, names: set[str]) -> list[str]:
|
|
if "xl/sharedStrings.xml" not in names:
|
|
return []
|
|
try:
|
|
root = ET.fromstring(zf.read("xl/sharedStrings.xml"))
|
|
except ET.ParseError:
|
|
return []
|
|
s = f"{{{_NS_S}}}"
|
|
return ["".join(t.text or "" for t in item.iter(f"{s}t")) for item in root.iter(f"{s}si")]
|
|
|
|
|
|
def _workbook_sheets(zf: zipfile.ZipFile) -> list[tuple[str, str, str]]:
    """List the sheets declared in ``xl/workbook.xml``.

    Returns one ``(name, state, relationship_id)`` tuple per ``sheet``
    element, in document order. Missing attributes default to ``"Sheet"``,
    ``"visible"`` and ``""`` respectively.
    """
    workbook = _zip_xml(zf, "xl/workbook.xml")
    sheet_tag = f"{{{_NS_S}}}sheet"
    rid_attr = f"{{{_NS_REL}}}id"

    found: list[tuple[str, str, str]] = []
    for element in workbook.iter(sheet_tag):
        entry = (
            element.get("name", "Sheet"),
            element.get("state", "visible"),
            element.get(rid_attr, ""),
        )
        found.append(entry)
    return found
|
|
|
|
|
|
def _workbook_rels(zf: zipfile.ZipFile, names: set[str]) -> dict[str, str]:
|
|
rels_path = "xl/_rels/workbook.xml.rels"
|
|
if rels_path not in names:
|
|
return {}
|
|
try:
|
|
root = ET.fromstring(zf.read(rels_path))
|
|
except ET.ParseError:
|
|
return {}
|
|
rel_tag = f"{{{_NS_PKG_REL}}}Relationship"
|
|
return {rel.get("Id", ""): rel.get("Target", "") for rel in root.iter(rel_tag) if rel.get("Id")}
|
|
|
|
|
|
def _sheet_part(target: str) -> str:
|
|
target = target.lstrip("/")
|
|
return posixpath.normpath(target if target.startswith("xl/") else f"xl/{target}")
|
|
|
|
|
|
def _col_index(ref: str) -> int:
|
|
idx = 0
|
|
for ch in ref:
|
|
if not ch.isalpha():
|
|
break
|
|
idx = idx * 26 + ord(ch.upper()) - ord("A") + 1
|
|
return max(idx - 1, 0)
|
|
|
|
|
|
def _sheet_rows(xml_bytes: bytes, shared: list[str]) -> list[list[str]]:
    """Parse worksheet XML into a list of rows of cell strings.

    Each ``row`` element yields one list, padded with ``""`` up to the highest
    column it uses. A cell's column comes from its ``r`` attribute; without
    one, the cell is placed just after the highest column seen so far in that
    row. Cells at or beyond ``_MAX_XLSX_COLS`` are dropped, reading stops
    after ``_MAX_XLSX_ROWS_PER_SHEET`` rows, and trailing rows that contain
    only blank values are removed.

    ``ET.ParseError`` from malformed XML is not caught here.
    """
    ns = f"{{{_NS_S}}}"
    row_tag = f"{ns}row"
    cell_tag = f"{ns}c"

    table: list[list[str]] = []
    for row_el in ET.fromstring(xml_bytes).iter(row_tag):
        if len(table) >= _MAX_XLSX_ROWS_PER_SHEET:
            break
        by_col: dict[int, str] = {}
        last_col = -1
        for cell_el in row_el.iter(cell_tag):
            ref = cell_el.get("r")
            if ref:
                col = _col_index(ref)
            else:
                col = last_col + 1
            if col >= _MAX_XLSX_COLS:
                continue
            by_col[col] = _cell_value(cell_el, shared, ns)
            if col > last_col:
                last_col = col
        # range(0) is empty, so a row without usable cells becomes [].
        table.append([by_col.get(i, "") for i in range(last_col + 1)])

    # Strip trailing rows that hold nothing but whitespace.
    while table and all(not value.strip() for value in table[-1]):
        table.pop()
    return table
|
|
|
|
|
|
def _cell_value(cell: ET.Element, shared: list[str], s: str) -> str:
|
|
value = cell.findtext(f"{s}v") or ""
|
|
typ = cell.get("t", "")
|
|
if typ == "s":
|
|
try:
|
|
return shared[int(value)]
|
|
except (ValueError, IndexError):
|
|
return ""
|
|
if typ == "inlineStr":
|
|
inline = cell.find(f"{s}is")
|
|
return "" if inline is None else "".join(t.text or "" for t in inline.iter(f"{s}t"))
|
|
if typ == "b":
|
|
return "TRUE" if value.strip() in {"1", "true", "TRUE"} else "FALSE"
|
|
if typ == "e":
|
|
return value or "#ERROR"
|
|
return value
|