From 817f39231145bbb24eb57b8b2835cb00c91e87ed Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Sat, 13 Jun 2026 14:42:51 -0700 Subject: [PATCH] feat(read): extract notebook and office documents (#37082) Add stdlib-only extraction for `.ipynb`, `.docx`, and `.xlsx` in read_file with lazy integration and malformed-document fallback. --- tests/tools/test_read_extract.py | 295 +++++++++++++++++++++++++++++++ tools/file_tools.py | 48 ++++- tools/read_extract.py | 248 ++++++++++++++++++++++++++ 3 files changed, 590 insertions(+), 1 deletion(-) create mode 100644 tests/tools/test_read_extract.py create mode 100644 tools/read_extract.py diff --git a/tests/tools/test_read_extract.py b/tests/tools/test_read_extract.py new file mode 100644 index 00000000000..3757e03c43b --- /dev/null +++ b/tests/tools/test_read_extract.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python3 +""" +Tests for structured-document extraction in the read_file tool. + +Covers .ipynb / .docx / .xlsx extraction (ported from Kilo-Org/kilocode +#10733, #10737, #10740) and the read_file_tool integration: pagination, +line-numbering, graceful fallback on malformed input, and hidden-sheet +omission. + +Run with: python -m pytest tests/tools/test_read_extract.py -v +""" + +import json +import os +import tempfile +import unittest +import zipfile + +from tools.read_extract import ( + ExtractionError, + extract_document_text, + is_extractable_document, +) +from tools.file_tools import read_file_tool + + +# --------------------------------------------------------------------------- +# Fixture builders — construct minimal valid OOXML / notebook files. 
+# --------------------------------------------------------------------------- + +def _write_notebook(path, cells, nbformat=4): + nb = {"cells": cells, "metadata": {}, "nbformat": nbformat, "nbformat_minor": 5} + with open(path, "w", encoding="utf-8") as fh: + json.dump(nb, fh) + + +def _write_docx(path, document_xml): + with zipfile.ZipFile(path, "w") as z: + z.writestr("[Content_Types].xml", "") + z.writestr("word/document.xml", document_xml) + + +def _write_xlsx(path, *, workbook, rels, shared, sheets): + """sheets: dict of part-name -> xml string.""" + with zipfile.ZipFile(path, "w") as z: + z.writestr("xl/workbook.xml", workbook) + z.writestr("xl/_rels/workbook.xml.rels", rels) + if shared is not None: + z.writestr("xl/sharedStrings.xml", shared) + for part, xml in sheets.items(): + z.writestr(part, xml) + + +_NS_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" +_NS_S = "http://schemas.openxmlformats.org/spreadsheetml/2006/main" + + +# --------------------------------------------------------------------------- +# is_extractable_document +# --------------------------------------------------------------------------- + +class TestIsExtractable(unittest.TestCase): + def test_recognized_extensions(self): + self.assertTrue(is_extractable_document("a.ipynb")) + self.assertTrue(is_extractable_document("/x/B.DOCX")) + self.assertTrue(is_extractable_document("report.xlsx")) + + def test_unrecognized_extensions(self): + self.assertFalse(is_extractable_document("a.py")) + self.assertFalse(is_extractable_document("a.pdf")) + self.assertFalse(is_extractable_document("a.txt")) + + +# --------------------------------------------------------------------------- +# Notebooks (.ipynb) — #10733 +# --------------------------------------------------------------------------- + +class TestNotebookExtraction(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="rex_nb_") + + def tearDown(self): + import shutil + shutil.rmtree(self.tmp, 
ignore_errors=True) + + def test_markdown_and_code_in_order(self): + p = os.path.join(self.tmp, "nb.ipynb") + _write_notebook(p, [ + {"cell_type": "markdown", "source": ["# Title\n", "para"]}, + {"cell_type": "code", "source": "x = 1\nprint(x)", + "outputs": [{"output_type": "stream", "text": ["1\n"]}], + "execution_count": 1}, + ]) + text = extract_document_text(p) + self.assertIn("# Title", text) + self.assertIn("print(x)", text) + # Output payloads must NOT leak into the extracted text. + self.assertNotIn("output_type", text) + self.assertNotIn("execution_count", text) + # Order preserved: markdown before code. + self.assertLess(text.index("Title"), text.index("print(x)")) + + def test_string_source_form(self): + p = os.path.join(self.tmp, "nb2.ipynb") + _write_notebook(p, [{"cell_type": "code", "source": "single string source"}]) + self.assertIn("single string source", extract_document_text(p)) + + def test_legacy_worksheets_form(self): + p = os.path.join(self.tmp, "nb3.ipynb") + nb = {"worksheets": [{"cells": [ + {"cell_type": "code", "input": "ignored", "source": "legacy cell"}]}], + "nbformat": 3} + with open(p, "w") as fh: + json.dump(nb, fh) + self.assertIn("legacy cell", extract_document_text(p)) + + def test_malformed_notebook_raises(self): + p = os.path.join(self.tmp, "bad.ipynb") + with open(p, "w") as fh: + fh.write("{ not valid json") + with self.assertRaises(ExtractionError): + extract_document_text(p) + + def test_empty_cells_raises(self): + p = os.path.join(self.tmp, "empty.ipynb") + _write_notebook(p, []) + with self.assertRaises(ExtractionError): + extract_document_text(p) + + +# --------------------------------------------------------------------------- +# Word documents (.docx) — #10737 +# --------------------------------------------------------------------------- + +class TestDocxExtraction(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="rex_docx_") + + def tearDown(self): + import shutil + shutil.rmtree(self.tmp, 
ignore_errors=True) + + def _doc(self, body): + return (f'' + f'{body}') + + def test_paragraphs_and_runs(self): + p = os.path.join(self.tmp, "d.docx") + _write_docx(p, self._doc( + 'Hello World' + 'Second')) + text = extract_document_text(p) + self.assertIn("Hello World", text) + self.assertIn("Second", text) + + def test_tabs_and_breaks(self): + p = os.path.join(self.tmp, "d2.docx") + _write_docx(p, self._doc( + 'ABC')) + text = extract_document_text(p) + self.assertIn("A\tB", text) + self.assertIn("C", text) + + def test_not_a_zip_raises(self): + p = os.path.join(self.tmp, "bad.docx") + with open(p, "wb") as fh: + fh.write(b"plain bytes, not a zip") + with self.assertRaises(ExtractionError): + extract_document_text(p) + + def test_missing_document_xml_raises(self): + p = os.path.join(self.tmp, "nodoc.docx") + with zipfile.ZipFile(p, "w") as z: + z.writestr("other.xml", "") + with self.assertRaises(ExtractionError): + extract_document_text(p) + + +# --------------------------------------------------------------------------- +# Excel workbooks (.xlsx) — #10740 +# --------------------------------------------------------------------------- + +class TestXlsxExtraction(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="rex_xlsx_") + + def tearDown(self): + import shutil + shutil.rmtree(self.tmp, ignore_errors=True) + + def _build(self, path, *, include_hidden=True): + r = "http://schemas.openxmlformats.org/officeDocument/2006/relationships" + hidden_sheet = (f'') if include_hidden else "" + workbook = ( + f'' + f'{hidden_sheet}' + f'') + rels = ( + '' + '' + '' + '') + shared = (f'NameScore' + f'Alice') + sheet1 = ( + f'' + '01' + '295' + '') + sheet2 = (f'' + 'SECRETDATA' + '') + _write_xlsx(path, workbook=workbook, rels=rels, shared=shared, + sheets={"xl/worksheets/sheet1.xml": sheet1, + "xl/worksheets/sheet2.xml": sheet2}) + + def test_visible_sheet_content(self): + p = os.path.join(self.tmp, "wb.xlsx") + self._build(p) + text = 
extract_document_text(p) + self.assertIn("Data", text) # sheet label + self.assertIn("Name\tScore", text) # shared-string header row + self.assertIn("Alice\t95", text) # string + numeric cells + + def test_hidden_sheet_omitted(self): + p = os.path.join(self.tmp, "wb2.xlsx") + self._build(p) + text = extract_document_text(p) + self.assertNotIn("SECRETDATA", text) + self.assertNotIn("Hidden", text) + + def test_not_a_zip_raises(self): + p = os.path.join(self.tmp, "bad.xlsx") + with open(p, "wb") as fh: + fh.write(b"nope") + with self.assertRaises(ExtractionError): + extract_document_text(p) + + +# --------------------------------------------------------------------------- +# read_file_tool integration +# --------------------------------------------------------------------------- + +class TestReadFileToolIntegration(unittest.TestCase): + def setUp(self): + self.tmp = tempfile.mkdtemp(prefix="rex_int_") + + def tearDown(self): + import shutil + shutil.rmtree(self.tmp, ignore_errors=True) + + def test_notebook_read_is_line_numbered(self): + p = os.path.join(self.tmp, "nb.ipynb") + _write_notebook(p, [ + {"cell_type": "markdown", "source": "# H"}, + {"cell_type": "code", "source": "print(1)"}, + ]) + res = json.loads(read_file_tool(p)) + self.assertTrue(res.get("extracted_document")) + self.assertIn("1|", res["content"]) # line-number gutter + self.assertIn("print(1)", res["content"]) + + def test_pagination(self): + p = os.path.join(self.tmp, "nb.ipynb") + _write_notebook(p, [ + {"cell_type": "code", "source": "a\nb\nc\nd\ne\nf"}, + ]) + res = json.loads(read_file_tool(p, offset=1, limit=2)) + self.assertTrue(res.get("truncated")) + self.assertIn("offset=3", res.get("hint", "")) + # Only first 2 lines present. 
+ self.assertIn("1|# ── Code cell 1 ──", res["content"]) + + def test_corrupt_docx_falls_through_to_binary_guard(self): + p = os.path.join(self.tmp, "bad.docx") + with open(p, "wb") as fh: + fh.write(b"not a zip") + res = json.loads(read_file_tool(p)) + # Should NOT crash; falls through to the binary-extension guard. + self.assertIn("error", res) + self.assertIn("binary", res["error"].lower()) + + def test_docx_read_extracts(self): + p = os.path.join(self.tmp, "d.docx") + _write_docx(p, (f'' + 'Report body' + '')) + res = json.loads(read_file_tool(p)) + self.assertTrue(res.get("extracted_document")) + self.assertIn("Report body", res["content"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/file_tools.py b/tools/file_tools.py index 4703cb4e5f7..c0b2fd06628 100644 --- a/tools/file_tools.py +++ b/tools/file_tools.py @@ -760,6 +760,52 @@ def read_file_tool(path: str, offset: int = 1, limit: int = 500, task_id: str = _resolved = _resolve_path_for_task(path, task_id) + # ── Structured-document extraction ──────────────────────────── + # Try before the binary-extension guard so .docx/.xlsx can render as text. + # Malformed documents fall through to the normal path/binary guard. 
+ from tools.read_extract import ExtractionError, extract_document_text, is_extractable_document + + if is_extractable_document(str(_resolved)): + try: + extracted_text = extract_document_text(str(_resolved)) + except ExtractionError: + logger.debug("document extraction failed for %s", path, exc_info=True) + else: + file_ops = _get_file_ops(task_id) + lines = extracted_text.splitlines() + total_lines = len(lines) + end_line = offset + limit - 1 + page_text = "\n".join(lines[offset - 1:end_line]) + result_dict = { + "content": file_ops._add_line_numbers(page_text, offset) if page_text else "", + "total_lines": total_lines, + "file_size": os.path.getsize(_resolved), + "truncated": total_lines > end_line, + "extracted_document": True, + } + if result_dict["truncated"]: + result_dict["hint"] = ( + f"Use offset={end_line + 1} to continue reading " + f"(showing {offset}-{min(end_line, total_lines)} of {total_lines} lines)" + ) + content_len = len(result_dict["content"]) + max_chars = _get_max_read_chars() + if content_len > max_chars: + return json.dumps({ + "error": ( + f"Read produced {content_len:,} characters which exceeds " + f"the safety limit ({max_chars:,} chars). " + "Use offset and limit to read a smaller range. " + f"The document has {total_lines} lines of extracted text." + ), + "path": path, + "total_lines": total_lines, + "file_size": result_dict["file_size"], + }, ensure_ascii=False) + if result_dict["content"]: + result_dict["content"] = redact_sensitive_text(result_dict["content"], code_file=True) + return json.dumps(result_dict, ensure_ascii=False) + # ── Binary file guard ───────────────────────────────────────── # Block binary files by extension (no I/O). if has_binary_extension(str(_resolved)): @@ -1427,7 +1473,7 @@ def _check_file_reqs(): READ_FILE_SCHEMA = { "name": "read_file", - "description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. 
Suggests similar filenames if not found. Use offset and limit for large files. Reads exceeding ~100K characters are rejected; use offset and limit to read specific sections of large files. NOTE: Cannot read images or binary files — use vision_analyze for images.", + "description": "Read a text file with line numbers and pagination. Use this instead of cat/head/tail in terminal. Output format: 'LINE_NUM|CONTENT'. Suggests similar filenames if not found. Use offset and limit for large files. Reads exceeding ~100K characters are rejected; use offset and limit to read specific sections of large files. Jupyter notebooks (.ipynb), Word documents (.docx), and Excel workbooks (.xlsx) are auto-extracted to readable text. NOTE: Cannot read images or other binary files — use vision_analyze for images.", "parameters": { "type": "object", "properties": { diff --git a/tools/read_extract.py b/tools/read_extract.py new file mode 100644 index 00000000000..3607703de60 --- /dev/null +++ b/tools/read_extract.py @@ -0,0 +1,248 @@ +"""Stdlib document-to-text extraction for ``read_file``. + +Supports Jupyter notebooks, DOCX, and XLSX without adding hard dependencies. +Malformed documents raise :class:`ExtractionError`; callers can then fall back to +normal text/binary handling. 
from __future__ import annotations

import json
import posixpath
import zipfile
import zlib
from collections.abc import Iterator
from pathlib import Path
from xml.etree import ElementTree as ET

__all__ = ["EXTRACTABLE_EXTENSIONS", "ExtractionError", "extract_document_text", "is_extractable_document"]

# Lower-cased file suffixes this module can render as text.
EXTRACTABLE_EXTENSIONS = frozenset({".ipynb", ".docx", ".xlsx"})
# Cap on the *uncompressed* size of any single archive part read into memory
# (enforced in ``_read_member`` for both XLSX and DOCX parts).
MAX_XLSX_BYTES = 50 * 1024 * 1024
# Rows beyond this count, and cells at or beyond this column index, are dropped.
_MAX_XLSX_ROWS_PER_SHEET = 5000
_MAX_XLSX_COLS = 256

_NS_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_NS_S = "http://schemas.openxmlformats.org/spreadsheetml/2006/main"
_NS_REL = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
_NS_PKG_REL = "http://schemas.openxmlformats.org/package/2006/relationships"
_NS_MC = "http://schemas.openxmlformats.org/markup-compatibility/2006"

# Failures ``ZipFile.read`` can raise besides BadZipFile/OSError: corrupt
# deflate data, truncated members, unsupported compression, encrypted members.
_MEMBER_READ_ERRORS = (zlib.error, EOFError, NotImplementedError, RuntimeError)


class ExtractionError(Exception):
    """Raised when a supported-looking document cannot be rendered as text."""


def _extension(path: str) -> str:
    """Return the lower-cased suffix of *path* if it is extractable, else ``""``."""
    ext = Path(path).suffix.lower()
    return ext if ext in EXTRACTABLE_EXTENSIONS else ""


def is_extractable_document(path: str) -> bool:
    """Return True when *path* has one of ``EXTRACTABLE_EXTENSIONS`` (case-insensitive)."""
    return bool(_extension(path))


def extract_document_text(path: str) -> str:
    """Render the document at *path* as plain text.

    Dispatches on the file suffix to the notebook, DOCX, or XLSX extractor.

    Raises:
        ExtractionError: if the suffix is unsupported or the document cannot
            be parsed.
    """
    ext = _extension(path)
    if ext == ".ipynb":
        return _extract_notebook(path)
    if ext == ".docx":
        return _extract_docx(path)
    if ext == ".xlsx":
        return _extract_xlsx(path)
    raise ExtractionError(f"Unsupported document type: {path!r}")


def _source_text(source) -> str:
    """Flatten a notebook ``source`` value (a string or a list of strings) to one string."""
    if isinstance(source, str):
        return source
    if isinstance(source, list):
        return "".join(item for item in source if isinstance(item, str))
    return ""


def _notebook_cells(nb: dict) -> list:
    """Return the notebook's cell list, falling back to legacy ``worksheets``.

    Anything that is not shaped like a list of cells is ignored rather than
    iterated, so odd JSON (``"worksheets": null``) cannot raise ``TypeError``.
    """
    cells = nb.get("cells")
    if isinstance(cells, list):
        return cells
    collected: list = []
    worksheets = nb.get("worksheets")
    if isinstance(worksheets, list):
        for ws in worksheets:
            if isinstance(ws, dict) and isinstance(ws.get("cells"), list):
                collected.extend(ws["cells"])
    return collected


def _extract_notebook(path: str) -> str:
    """Render a Jupyter notebook as labelled markdown/code/raw cell sources.

    Cell outputs and execution metadata are not included.

    Raises:
        ExtractionError: if the file is unreadable, is not a JSON object, or
            holds no markdown/code/raw cells.
    """
    try:
        # utf-8-sig also reads plain UTF-8; it only strips a leading BOM,
        # which json would otherwise reject.
        with open(path, encoding="utf-8-sig", errors="replace") as fh:
            nb = json.load(fh)
    except (OSError, ValueError, RecursionError) as exc:
        # JSONDecodeError is a ValueError; RecursionError covers absurd nesting.
        raise ExtractionError(f"Not a valid notebook: {exc}") from exc
    if not isinstance(nb, dict):
        raise ExtractionError("Notebook root is not an object")

    cells = _notebook_cells(nb)
    if not cells:
        raise ExtractionError("Notebook contains no cells")

    counts = {"markdown": 0, "code": 0, "raw": 0}
    labels = {"markdown": "Markdown", "code": "Code", "raw": "Raw"}
    out: list[str] = []
    for cell in cells:
        if not isinstance(cell, dict):
            continue
        typ = cell.get("cell_type")
        if typ not in labels:
            continue
        counts[typ] += 1
        suffix = f" {counts[typ]}" if typ != "raw" else ""
        source = cell.get("source")
        if source is None:
            # nbformat <= 3 stores code-cell text under "input".
            source = cell.get("input", "")
        out.extend((f"# ── {labels[typ]} cell{suffix} ──", _source_text(source).rstrip("\n"), ""))
    if not out:
        raise ExtractionError("Notebook contains no readable cells")
    return "\n".join(out).rstrip("\n") + "\n"


def _read_member(zf: zipfile.ZipFile, name: str) -> bytes:
    """Read archive part *name*, enforcing ``MAX_XLSX_BYTES``.

    ``BadZipFile`` and ``OSError`` are left to propagate so callers keep their
    own format-specific messages.

    Raises:
        ExtractionError: if the part is missing, declares an uncompressed size
            above the cap, or cannot be decompressed.
    """
    try:
        info = zf.getinfo(name)
    except KeyError as exc:
        raise ExtractionError(f"Missing {name}") from exc
    if info.file_size > MAX_XLSX_BYTES:
        raise ExtractionError(f"{name} is too large to extract ({info.file_size:,} bytes uncompressed)")
    try:
        return zf.read(info)
    except _MEMBER_READ_ERRORS as exc:
        raise ExtractionError(f"Cannot read {name}: {exc}") from exc


def _zip_xml(zf: zipfile.ZipFile, name: str) -> ET.Element:
    """Parse archive part *name* as XML and return its root element.

    Raises:
        ExtractionError: if the part is missing, unreadable, or not well-formed.
    """
    data = _read_member(zf, name)
    # NOTE(review): ElementTree is not hardened against maliciously
    # constructed XML; see the Python ``xml`` package security notes.
    try:
        return ET.fromstring(data)
    except ET.ParseError as exc:
        raise ExtractionError(f"Malformed XML in {name}: {exc}") from exc


def _walk(node: ET.Element, pruned: frozenset[str]) -> Iterator[ET.Element]:
    """Yield descendants of *node* in document order.

    Elements whose tag is in *pruned* are neither yielded nor descended into.
    """
    stack = list(node)[::-1]
    while stack:
        current = stack.pop()
        if current.tag in pruned:
            continue
        yield current
        stack.extend(list(current)[::-1])


def _extract_docx(path: str) -> str:
    """Render ``word/document.xml`` of a DOCX as one or more lines per paragraph.

    Tabs become ``\\t`` and explicit breaks split the paragraph into lines.
    A paragraph nested inside another (text-box content) is emitted once, on
    its own, and ``mc:Fallback`` branches are skipped because they repeat the
    content of the sibling ``mc:Choice``.

    Raises:
        ExtractionError: if the archive or XML is invalid, or no text is found.
    """
    try:
        with zipfile.ZipFile(path) as zf:
            root = _zip_xml(zf, "word/document.xml")
    except zipfile.BadZipFile as exc:
        raise ExtractionError(f"Not a valid DOCX: {exc}") from exc
    except OSError as exc:
        raise ExtractionError(str(exc)) from exc

    w = f"{{{_NS_W}}}"
    p_tag, t_tag, tab_tag = f"{w}p", f"{w}t", f"{w}tab"
    break_tags = {f"{w}br", f"{w}cr"}
    fallback = f"{{{_NS_MC}}}Fallback"
    outside_fallback = frozenset({fallback})
    # Inside a paragraph, stop at nested paragraphs: they get their own lines.
    own_content = frozenset({fallback, p_tag})

    lines: list[str] = []
    for para in _walk(root, outside_fallback):
        if para.tag != p_tag:
            continue
        buf: list[str] = []
        for node in _walk(para, own_content):
            if node.tag == t_tag:
                buf.append(node.text or "")
            elif node.tag == tab_tag:
                buf.append("\t")
            elif node.tag in break_tags:
                buf.append("\n")
        lines.extend("".join(buf).split("\n"))
    if not any(line.strip() for line in lines):
        raise ExtractionError("DOCX contains no extractable text")
    return "\n".join(lines).rstrip("\n") + "\n"


def _extract_xlsx(path: str) -> str:
    """Render each visible worksheet of an XLSX as tab-separated rows.

    Hidden and very-hidden sheets are omitted; sheets whose part is missing
    or malformed are skipped.

    Raises:
        ExtractionError: if the archive or workbook is invalid, or no visible
            sheet could be rendered.
    """
    try:
        with zipfile.ZipFile(path) as zf:
            names = set(zf.namelist())
            shared = _shared_strings(zf, names)
            sheets = _workbook_sheets(zf)
            rels = _workbook_rels(zf, names)
            out: list[str] = []
            for name, state, rid in sheets:
                if state in {"hidden", "veryHidden"}:
                    continue
                part = _sheet_part(rels.get(rid, ""))
                if part not in names:
                    continue
                try:
                    rows = _sheet_rows(_read_member(zf, part), shared)
                except ET.ParseError:
                    continue
                out.append(f"# ── Sheet: {name} ──")
                out.extend("\t".join(row) for row in rows)
                if not rows:
                    out.append("(empty)")
                out.append("")
    except zipfile.BadZipFile as exc:
        raise ExtractionError(f"Not a valid XLSX: {exc}") from exc
    except OSError as exc:
        raise ExtractionError(str(exc)) from exc

    if not out:
        raise ExtractionError("XLSX has no visible sheets with content")
    return "\n".join(out).rstrip("\n") + "\n"


def _rich_text(item: ET.Element, s: str) -> str:
    """Return the display text of a string item (``si`` or ``is`` element).

    Only the direct ``t`` child and the ``t`` of each ``r`` run are used, so
    phonetic runs (``rPh``) are not mixed into the value.
    """
    t_tag, r_tag = f"{s}t", f"{s}r"
    parts: list[str] = []
    for child in item:
        if child.tag == t_tag:
            parts.append(child.text or "")
        elif child.tag == r_tag:
            parts.append(child.findtext(t_tag) or "")
    return "".join(parts)


def _shared_strings(zf: zipfile.ZipFile, names: set[str]) -> list[str]:
    """Return the workbook's shared-string table, or ``[]`` if absent or malformed."""
    if "xl/sharedStrings.xml" not in names:
        return []
    try:
        root = ET.fromstring(_read_member(zf, "xl/sharedStrings.xml"))
    except ET.ParseError:
        return []
    s = f"{{{_NS_S}}}"
    return [_rich_text(item, s) for item in root.iter(f"{s}si")]


def _workbook_sheets(zf: zipfile.ZipFile) -> list[tuple[str, str, str]]:
    """Return ``(name, state, relationship id)`` for each sheet in ``xl/workbook.xml``."""
    root = _zip_xml(zf, "xl/workbook.xml")
    s, r = f"{{{_NS_S}}}", f"{{{_NS_REL}}}"
    return [
        (sheet.get("name", "Sheet"), sheet.get("state", "visible"), sheet.get(f"{r}id", ""))
        for sheet in root.iter(f"{s}sheet")
    ]


def _workbook_rels(zf: zipfile.ZipFile, names: set[str]) -> dict[str, str]:
    """Map relationship ids to targets; ``{}`` if the rels part is absent or malformed."""
    rels_path = "xl/_rels/workbook.xml.rels"
    if rels_path not in names:
        return {}
    try:
        root = ET.fromstring(_read_member(zf, rels_path))
    except ET.ParseError:
        return {}
    rel_tag = f"{{{_NS_PKG_REL}}}Relationship"
    return {rel.get("Id", ""): rel.get("Target", "") for rel in root.iter(rel_tag) if rel.get("Id")}


def _sheet_part(target: str) -> str:
    """Turn a relationship target into a normalised archive member name under ``xl/``."""
    target = target.lstrip("/")
    return posixpath.normpath(target if target.startswith("xl/") else f"xl/{target}")


def _col_index(ref: str) -> int:
    """Return the zero-based column index encoded by the letters of a cell ref (``"B7"`` -> 1)."""
    idx = 0
    for ch in ref:
        if not ch.isalpha():
            break
        idx = idx * 26 + ord(ch.upper()) - ord("A") + 1
    return max(idx - 1, 0)


def _sheet_rows(xml_bytes: bytes, shared: list[str]) -> list[list[str]]:
    """Parse worksheet XML into rows of cell strings.

    Gaps between referenced columns are filled with ``""``; at most
    ``_MAX_XLSX_ROWS_PER_SHEET`` rows are kept and trailing blank rows are
    dropped.

    Raises:
        xml.etree.ElementTree.ParseError: if *xml_bytes* is not well-formed.
    """
    root = ET.fromstring(xml_bytes)
    s = f"{{{_NS_S}}}"
    rows: list[list[str]] = []
    for row in root.iter(f"{s}row"):
        if len(rows) >= _MAX_XLSX_ROWS_PER_SHEET:
            break
        cells: dict[int, str] = {}
        max_col = -1
        for cell in row.iter(f"{s}c"):
            # A cell without an "r" reference sits right after the previous one.
            col = _col_index(cell.get("r", "")) if cell.get("r") else max_col + 1
            if col >= _MAX_XLSX_COLS:
                continue
            cells[col] = _cell_value(cell, shared, s)
            max_col = max(max_col, col)
        rows.append([cells.get(i, "") for i in range(max_col + 1)] if max_col >= 0 else [])
    while rows and not any(value.strip() for value in rows[-1]):
        rows.pop()
    return rows


def _cell_value(cell: ET.Element, shared: list[str], s: str) -> str:
    """Return the display string for one ``c`` element.

    Shared-string cells are resolved through *shared*; an index that is not an
    integer or lies outside the table (including negative values) gives ``""``.
    """
    value = cell.findtext(f"{s}v") or ""
    typ = cell.get("t", "")
    if typ == "s":
        try:
            idx = int(value)
        except ValueError:
            return ""
        # Reject negatives explicitly: shared[-1] would return the last entry.
        return shared[idx] if 0 <= idx < len(shared) else ""
    if typ == "inlineStr":
        inline = cell.find(f"{s}is")
        return "" if inline is None else _rich_text(inline, s)
    if typ == "b":
        return "TRUE" if value.strip() in {"1", "true", "TRUE"} else "FALSE"
    if typ == "e":
        return value or "#ERROR"
    return value