From e7ae145ac42cb129f8a529b40bb1749535885bbe Mon Sep 17 00:00:00 2001 From: xxxigm Date: Thu, 11 Jun 2026 16:51:06 +0700 Subject: [PATCH] fix(gateway): guide the agent to read attached PDF/DOCX instead of punting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a user attached a binary document (PDF, DOCX, XLSX, …) in chat, the context note prepended to the turn said "Ask the user what they'd like you to do with it." That steered the model into asking the user to paste the contents rather than extracting the text it is fully capable of reading — so attached PDFs/DOCX appeared "unreadable" to the agent. Rewrite the binary-document note to tell the agent the file is a non-text format saved at the given path and to extract its text itself (e.g. via the terminal tool or the ocr-and-documents skill) before answering. Text documents (whose content is already inlined by the platform adapter) keep their existing note. The note construction is pulled into a small `_build_document_context_note` helper so it is unit-testable. --- gateway/run.py | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 041a6efcd56..85267cc44ee 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1425,6 +1425,33 @@ def _build_media_placeholder(event) -> str: return "\n".join(parts) +def _build_document_context_note(display_name: str, agent_path: str, mtype: str) -> str: + """Context note prepended to a user turn when they attach a document. + + Text documents (``text/*``) have their content inlined upstream by the + platform adapter, so the note just confirms that and records the path. + + Binary documents (PDF, DOCX, XLSX, …) cannot be inlined as text. The note + must tell the agent to *extract* the text itself before answering — earlier + wording ("Ask the user what they'd like you to do with it") steered the + model into punting back to the user, which is why attached PDFs/DOCX looked + "unreadable" to the agent even though it has the tools to read them. + """ + if mtype.startswith("text/"): + return ( + f"[The user sent a text document: '{display_name}'. " + f"Its content has been included below. " + f"The file is also saved at: {agent_path}]" + ) + return ( + f"[The user sent a document: '{display_name}'. It is saved at: {agent_path}. " + f"Its text is not inlined here (it's a binary format such as PDF or DOCX). " + f"To read it, extract the document's text yourself — for example with the " + f"terminal tool or the ocr-and-documents skill — before answering, instead " + f"of asking the user to paste the contents.]" + ) + + def _format_duration(seconds: float) -> str: total = int(round(seconds)) if total < 0: @@ -7732,18 +7759,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew # cache directories are auto-mounted at /root/.hermes/cache/* by get_cache_directory_mounts(). agent_path = to_agent_visible_cache_path(path) - if mtype.startswith("text/"): - context_note = ( - f"[The user sent a text document: '{display_name}'. " - f"Its content has been included below. " - f"The file is also saved at: {agent_path}]" - ) - else: - context_note = ( - f"[The user sent a document: '{display_name}'. " - f"The file is saved at: {agent_path}. " - f"Ask the user what they'd like you to do with it.]" - ) + context_note = _build_document_context_note(display_name, agent_path, mtype) message_text = f"{context_note}\n\n{message_text}" if getattr(event, "reply_to_text", None) and event.reply_to_message_id: