mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-12 08:51:53 +00:00
fix(gateway): guide the agent to read attached PDF/DOCX instead of punting
When a user attached a binary document (PDF, DOCX, XLSX, …) in chat, the context note prepended to the turn said "Ask the user what they'd like you to do with it." That steered the model into asking the user to paste the contents rather than extracting the text it is fully capable of reading — so attached PDFs/DOCX appeared "unreadable" to the agent. Rewrite the binary-document note to tell the agent the file is a non-text format saved at the given path and to extract its text itself (e.g. via the terminal tool or the ocr-and-documents skill) before answering. Text documents (whose content is already inlined by the platform adapter) keep their existing note. The note construction is pulled into a small `_build_document_context_note` helper so it is unit-testable.
This commit is contained in:
parent
ce99a81123
commit
e7ae145ac4
1 changed files with 28 additions and 12 deletions
|
|
@ -1425,6 +1425,33 @@ def _build_media_placeholder(event) -> str:
|
|||
return "\n".join(parts)
|
||||
|
||||
|
||||
def _build_document_context_note(display_name: str, agent_path: str, mtype: str) -> str:
|
||||
"""Context note prepended to a user turn when they attach a document.
|
||||
|
||||
Text documents (``text/*``) have their content inlined upstream by the
|
||||
platform adapter, so the note just confirms that and records the path.
|
||||
|
||||
Binary documents (PDF, DOCX, XLSX, …) cannot be inlined as text. The note
|
||||
must tell the agent to *extract* the text itself before answering — earlier
|
||||
wording ("Ask the user what they'd like you to do with it") steered the
|
||||
model into punting back to the user, which is why attached PDFs/DOCX looked
|
||||
"unreadable" to the agent even though it has the tools to read them.
|
||||
"""
|
||||
if mtype.startswith("text/"):
|
||||
return (
|
||||
f"[The user sent a text document: '{display_name}'. "
|
||||
f"Its content has been included below. "
|
||||
f"The file is also saved at: {agent_path}]"
|
||||
)
|
||||
return (
|
||||
f"[The user sent a document: '{display_name}'. It is saved at: {agent_path}. "
|
||||
f"Its text is not inlined here (it's a binary format such as PDF or DOCX). "
|
||||
f"To read it, extract the document's text yourself — for example with the "
|
||||
f"terminal tool or the ocr-and-documents skill — before answering, instead "
|
||||
f"of asking the user to paste the contents.]"
|
||||
)
|
||||
|
||||
|
||||
def _format_duration(seconds: float) -> str:
|
||||
total = int(round(seconds))
|
||||
if total < 0:
|
||||
|
|
@ -7732,18 +7759,7 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
# cache directories are auto-mounted at /root/.hermes/cache/* by get_cache_directory_mounts().
|
||||
agent_path = to_agent_visible_cache_path(path)
|
||||
|
||||
if mtype.startswith("text/"):
|
||||
context_note = (
|
||||
f"[The user sent a text document: '{display_name}'. "
|
||||
f"Its content has been included below. "
|
||||
f"The file is also saved at: {agent_path}]"
|
||||
)
|
||||
else:
|
||||
context_note = (
|
||||
f"[The user sent a document: '{display_name}'. "
|
||||
f"The file is saved at: {agent_path}. "
|
||||
f"Ask the user what they'd like you to do with it.]"
|
||||
)
|
||||
context_note = _build_document_context_note(display_name, agent_path, mtype)
|
||||
message_text = f"{context_note}\n\n{message_text}"
|
||||
|
||||
if getattr(event, "reply_to_text", None) and event.reply_to_message_id:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue