diff --git a/apps/desktop/src/lib/chat-messages.test.ts b/apps/desktop/src/lib/chat-messages.test.ts index f32310adfcf..44a6915fed7 100644 --- a/apps/desktop/src/lib/chat-messages.test.ts +++ b/apps/desktop/src/lib/chat-messages.test.ts @@ -3,6 +3,7 @@ import { describe, expect, it } from 'vitest' import type { ChatMessage, ChatMessagePart } from './chat-messages' import { appendAssistantTextPart, + appendReasoningPart, chatMessageText, preserveLocalAssistantErrors, renderMediaTags, @@ -175,6 +176,52 @@ describe('renderMediaTags', () => { }) }) +describe('interleaved reasoning/text coalescing', () => { + it('keeps narration contiguous when reasoning interrupts mid-sentence', () => { + // Models that interleave reasoning_content + content deltas emit + // text → reasoning → text within one tool-bounded segment. The two text + // fragments are really one sentence and must not be split by the + // "Thinking" block between them. + let parts: ChatMessagePart[] = appendAssistantTextPart([], 'Let me ') + parts = appendReasoningPart(parts, 'checking the file...') + parts = appendAssistantTextPart(parts, 'verify the full file is correct:') + + expect(parts.map(p => p.type)).toEqual(['text', 'reasoning']) + expect((parts[0] as { text: string }).text).toBe('Let me verify the full file is correct:') + expect((parts[1] as { text: string }).text).toBe('checking the file...') + }) + + it('merges reasoning bursts that straddle a narration fragment', () => { + let parts: ChatMessagePart[] = appendReasoningPart([], 'first thought ') + parts = appendAssistantTextPart(parts, 'Working on it.') + parts = appendReasoningPart(parts, 'second thought') + + expect(parts.map(p => p.type)).toEqual(['reasoning', 'text']) + expect((parts[0] as { text: string }).text).toBe('first thought second thought') + expect((parts[1] as { text: string }).text).toBe('Working on it.') + }) + + it('starts a fresh text part after a tool call (segment boundary)', () => { + let parts: ChatMessagePart[] = appendAssistantTextPart([], 'Let me check.') + parts = upsertToolPart(parts, { name: 'read_file', tool_id: 'tc-1' }, 'running') + parts = appendAssistantTextPart(parts, 'Now editing.') + + expect(parts.map(p => p.type)).toEqual(['text', 'tool-call', 'text']) + expect((parts[0] as { text: string }).text).toBe('Let me check.') + expect((parts[2] as { text: string }).text).toBe('Now editing.') + }) + + it('does not merge reasoning across a tool call', () => { + let parts: ChatMessagePart[] = appendReasoningPart([], 'before tool') + parts = upsertToolPart(parts, { name: 'read_file', tool_id: 'tc-1' }, 'running') + parts = appendReasoningPart(parts, 'after tool') + + expect(parts.map(p => p.type)).toEqual(['reasoning', 'tool-call', 'reasoning']) + expect((parts[0] as { text: string }).text).toBe('before tool') + expect((parts[2] as { text: string }).text).toBe('after tool') + }) +}) + describe('preserveLocalAssistantErrors', () => { it('preserves a local user+error pair when hydration omits the failed turn', () => { const nextMessages: ChatMessage[] = [ diff --git a/apps/desktop/src/lib/chat-messages.ts b/apps/desktop/src/lib/chat-messages.ts index e4b1f2fb162..ef4eef6662d 100644 --- a/apps/desktop/src/lib/chat-messages.ts +++ b/apps/desktop/src/lib/chat-messages.ts @@ -178,50 +178,70 @@ function displayContentForMessage(role: SessionMessage['role'], content: unknown return [refs.join('\n'), visibleText].filter(Boolean).join('\n\n') || visibleText } -export function appendTextPart(parts: ChatMessagePart[], delta: string): ChatMessagePart[] { - const next = [...parts] - const last = next.at(-1) - - if (last?.type === 'text') { - next[next.length - 1] = { ...last, text: `${last.text}${delta}` } - - return next - } - - next.push(textPart(delta)) - - return next +const STREAM_PART: Record<'reasoning' | 'text', (text: string) => ChatMessagePart> = { + reasoning: reasoningPart, + text: textPart } -export function appendAssistantTextPart(parts: ChatMessagePart[], delta: string): ChatMessagePart[] { - const next = appendTextPart(parts, delta) - const last = next.at(-1) +// Coalesce a streaming delta into the most recent same-type part within the +// current segment, where a segment is bounded by any non-streaming part (a +// tool call, image, …). The opposite streaming channel (text <-> reasoning) is +// transparent, so a reasoning burst between two content deltas can't shred one +// sentence into text / Thinking / text — the fragmentation models that +// interleave reasoning_content + content otherwise produce. Tool calls still +// open a fresh part, preserving narration order across steps. +function appendStreamPart( + parts: ChatMessagePart[], + type: 'reasoning' | 'text', + delta: string +): { index: number; parts: ChatMessagePart[] } { + const next = [...parts] - if (last?.type === 'text') { - const current = last.text + for (let i = next.length - 1; i >= 0; i--) { + const part = next[i] - const deltaMayContainMedia = - delta.includes('MEDIA:') || delta.includes('DIA:') || delta.includes('EDIA:') || delta.includes('IA:') + if (part.type === type) { + next[i] = { ...part, text: `${(part as { text: string }).text}${delta}` } as ChatMessagePart - const needsMediaPass = deltaMayContainMedia || current.includes('MEDIA:') - const nextText = needsMediaPass ? renderMediaTags(current) : current - next[next.length - 1] = nextText === current ? last : { ...last, text: nextText } + return { index: i, parts: next } + } + + if (part.type !== 'text' && part.type !== 'reasoning') { + break + } } - return next + next.push(STREAM_PART[type](delta)) + + return { index: next.length - 1, parts: next } +} + +export function appendTextPart(parts: ChatMessagePart[], delta: string): ChatMessagePart[] { + return appendStreamPart(parts, 'text', delta).parts } export function appendReasoningPart(parts: ChatMessagePart[], delta: string): ChatMessagePart[] { - const next = [...parts] - const last = next.at(-1) + return appendStreamPart(parts, 'reasoning', delta).parts +} - if (last?.type === 'reasoning') { - next[next.length - 1] = { ...last, text: `${last.text}${delta}` } +export function appendAssistantTextPart(parts: ChatMessagePart[], delta: string): ChatMessagePart[] { + const { index, parts: next } = appendStreamPart(parts, 'text', delta) + const part = next[index] + if (part?.type !== 'text') { return next } - next.push(reasoningPart(delta)) + const mayContainMedia = + delta.includes('MEDIA:') || delta.includes('DIA:') || delta.includes('EDIA:') || delta.includes('IA:') + + if (mayContainMedia || part.text.includes('MEDIA:')) { + const rendered = renderMediaTags(part.text) + + if (rendered !== part.text) { + next[index] = { ...part, text: rendered } + } + } return next }