From d99e2a29d66135675cf4a237b0131fbdf0927118 Mon Sep 17 00:00:00 2001
From: waxinz <waxinz@users.noreply.github.com>
Date: Sat, 11 Apr 2026 16:29:02 -0700
Subject: [PATCH] feat: standardize message whitespace and JSON formatting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Normalize api_messages before each API call for consistent prefix
matching across turns:

1. Strip leading/trailing whitespace from system prompt parts
2. Strip leading/trailing whitespace from message content strings
3. Normalize tool-call arguments to compact sorted JSON

This enables KV cache reuse on local inference servers (llama.cpp,
vLLM, Ollama) and improves cache hit rates for cloud providers.

All normalization operates on the api_messages copy — the original
conversation history in messages is never mutated.  Tool-call JSON
normalization creates new dicts via spread to avoid the shallow-copy
mutation bug in the original PR.

Salvaged from PR #7875 by @waxinz with mutation fix.
---
 run_agent.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/run_agent.py b/run_agent.py
index 7b97c0ded22..ecaa92b41c9 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -3212,7 +3212,7 @@ class AIAgent:
         if platform_key in PLATFORM_HINTS:
             prompt_parts.append(PLATFORM_HINTS[platform_key])
 
-        return "\n\n".join(prompt_parts)
+        return "\n\n".join(p.strip() for p in prompt_parts if p.strip())
 
     # =========================================================================
     # Pre/post-call guardrails (inspired by PR #1321 — @alireza78a)
@@ -8047,6 +8047,36 @@ class AIAgent:
             # manual message manipulation are always caught.
             api_messages = self._sanitize_api_messages(api_messages)
 
+            # Normalize message whitespace and tool-call JSON for consistent
+            # prefix matching.  Ensures bit-perfect prefixes across turns,
+            # which enables KV cache reuse on local inference servers
+            # (llama.cpp, vLLM, Ollama) and improves cache hit rates for
+            # cloud providers.  Operates on api_messages (the API copy) so
+            # the original conversation history in `messages` is untouched.
+            for am in api_messages:
+                if isinstance(am.get("content"), str):
+                    am["content"] = am["content"].strip()
+            for am in api_messages:
+                tcs = am.get("tool_calls")
+                if not tcs:
+                    continue
+                new_tcs = []
+                for tc in tcs:
+                    if isinstance(tc, dict) and "function" in tc:
+                        try:
+                            args_obj = json.loads(tc["function"]["arguments"])
+                            tc = {**tc, "function": {
+                                **tc["function"],
+                                "arguments": json.dumps(
+                                    args_obj, separators=(",", ":"),
+                                    sort_keys=True,
+                                ),
+                            }}
+                        except Exception:
+                            pass
+                    new_tcs.append(tc)
+                am["tool_calls"] = new_tcs
+
             # Calculate approximate request size for logging
             total_chars = sum(len(str(msg)) for msg in api_messages)
             approx_tokens = estimate_messages_tokens_rough(api_messages)