From 13650ab7f86f43d3ff107bb14c444794c56aab72 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Thu, 11 Jun 2026 11:39:35 -0700 Subject: [PATCH] fix(gateway): audio attachment note no longer steers the agent into punting Sibling site of the PDF/DOCX note fixed in PR #44175: the audio file attachment context note led with "Ask the user what they'd like you to do with it", steering the model into asking instead of transcribing. Rewritten to instruct the agent to transcribe/process the file itself when the request involves its content, only asking when intent is genuinely unclear. Contract assertion added to the existing audio attachment note test. --- gateway/run.py | 6 +++++- tests/gateway/test_telegram_audio_vs_voice.py | 4 ++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/gateway/run.py b/gateway/run.py index 85267cc44ee..817c8441bae 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -7727,7 +7727,11 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew _note = ( f"[The user sent an audio file attachment: '{_display}'. " f"It is saved at: {_agent_path}. " - f"Ask the user what they'd like you to do with it, or pass the path to a transcription or media tool.]" + f"Its content is not inlined here. If the user's request involves " + f"what the audio contains, transcribe or process it yourself — for " + f"example by passing the path to a transcription or media tool — " + f"instead of asking the user to describe it. Only ask what to do " + f"with it if their intent is genuinely unclear.]" ) message_text = f"{_note}\n\n{message_text}" diff --git a/tests/gateway/test_telegram_audio_vs_voice.py b/tests/gateway/test_telegram_audio_vs_voice.py index 5af5cb920a7..1d1bf0cb78b 100644 --- a/tests/gateway/test_telegram_audio_vs_voice.py +++ b/tests/gateway/test_telegram_audio_vs_voice.py @@ -134,6 +134,10 @@ async def test_audio_attachment_context_note_format(): assert "audio file attachment" in result.lower() # Should NOT contain the voice-message transcription wrapper text assert "voice message" not in result.lower() + # Guides the agent to transcribe/process the file itself rather than + # punting back to the user (same bug class as the PDF/DOCX note). + assert "transcri" in result.lower() + assert "ask the user what they'd like" not in result.lower() # ---------------------------------------------------------------------------