fix: persist non-NULL system prompt on fresh turn setup (#45499) (#52616)

build_turn_context() created the DB session row via _ensure_db_session()
before the system prompt was restored/built, so a fresh API/gateway agent
carrying client-managed history inserted a row with system_prompt=NULL. That
tripped the misleading 'stored system prompt is null; rebuilding from scratch
... investigate the previous turn's write path' warning and guaranteed a
first-turn prefix cache miss. Move row creation to after _cached_system_prompt
is populated.

Verified live (OpenRouter + claude-sonnet-4.5): persistent-agent turns show
cache_read jumping to the full prefix on turn 2+ (write 24411 -> read 24411),
and the persisted system_prompt is non-NULL so fresh-agent restore keeps the
prefix cache warm.

Tests: added a turn-context ordering regression test asserting that
_ensure_db_session() runs after _cached_system_prompt is populated.
This commit is contained in:
Teknium 2026-06-25 12:54:19 -07:00 committed by GitHub
parent d7021af30f
commit 2a1e615565
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 39 additions and 2 deletions

View file

@ -142,7 +142,13 @@ def build_turn_context(
# Guard stdio against OSError from broken pipes (systemd/headless/daemon).
install_safe_stdio()
agent._ensure_db_session()
# NOTE: the DB session row is created later, AFTER the system prompt is
# restored/built (see _ensure_db_session() below the system-prompt block).
# Creating it here — before _cached_system_prompt is populated — inserts a
# row with system_prompt=NULL on a fresh API/gateway agent that carries
# client-managed history, which then trips the "stored system prompt is
# null; rebuilding from scratch" warning and a needless first-turn prefix
# cache miss. (Issue #45499.)
# Tell auxiliary_client what the live main provider/model are for this turn.
try:
@ -309,6 +315,11 @@ def build_turn_context(
active_system_prompt = agent._cached_system_prompt
# Create the DB session row now that _cached_system_prompt is populated, so
# the persisted snapshot is written non-NULL on the first turn (Issue
# #45499). Idempotent: _ensure_db_session() no-ops once the row exists.
agent._ensure_db_session()
# Crash-resilience: persist the inbound user turn as soon as the session row exists.
try:
agent._persist_session(messages, conversation_history)

View file

@ -71,10 +71,13 @@ class _FakeAgent:
self._invalid_tool_retries = -1
self._vision_supported = None
self._persist_calls = 0
# Records _cached_system_prompt at the moment _ensure_db_session()
# is called (regression guard for #45499 turn-setup ordering).
self._ensure_db_prompt_at_call = "<unset>"
# --- methods the prologue calls ---
def _ensure_db_session(self):
pass
self._ensure_db_prompt_at_call = self._cached_system_prompt
def _restore_primary_runtime(self):
pass
@ -190,6 +193,29 @@ def test_no_review_when_memory_disabled():
assert ctx.should_review_memory is False
def test_ensure_db_session_runs_after_system_prompt_restore():
    """Guard the turn-setup ordering fixed for #45499.

    A fresh API/gateway agent starts with ``_cached_system_prompt is None``.
    The DB session row has to be created only AFTER the system prompt has
    been restored/built, so that the persisted snapshot is non-NULL. Were
    ``_ensure_db_session()`` to run first, it would insert
    ``system_prompt=NULL``, which trips the misleading "stored system prompt
    is null; rebuilding" warning and costs a first-turn prefix cache miss.
    """
    expected_prompt = "REBUILT-SYSTEM"

    def _populate_prompt(_agent, _system_message, _history):
        # Stand-in for the restore/build hook: it only fills the cache.
        _agent._cached_system_prompt = expected_prompt

    agent = _FakeAgent()
    # Fresh agent: nothing is cached until the restore hook has run.
    agent._cached_system_prompt = None

    _build(agent, restore_or_build_system_prompt=_populate_prompt)

    # The fake agent snapshots _cached_system_prompt at the moment
    # _ensure_db_session() is called, so this pins the call ordering.
    assert agent._ensure_db_prompt_at_call == expected_prompt
    assert agent._cached_system_prompt == expected_prompt
# ── Between-turns MCP refresh (cache-safe late-binding) ──────────────────────
#
# A slow MCP server that connects after the agent's build-time tool snapshot