fix(desktop): polish LaTeX rendering — currency, code blocks, brackets

Five distinct bugs surfaced from a math-heavy stress test:

1. Adjacent code fences glued together. scrubBacktickNoise's second-pass regex /``\s*``/g matched the LAST 2 backticks of one fence + whitespace + FIRST 2 backticks of the next, collapsing two blocks into one. Fixed with lookbehind/lookahead so we only match exactly 2 backticks not part of a longer run.

2. Whitespace eaten between fences and following content. stripPreviewTargets internally calls .trim(), which strips leading/trailing whitespace from each split segment. For segments between two fences this collapsed \n\n to '', gluing the fence close to the next block. Fixed by capturing leading/trailing whitespace at the call site and restoring it after the transform.

3. Currency dollar signs eaten as math. With singleDollarTextMath: true, remark-math greedy-matched any pair of $, so '$5 ... $10' became one inline math span. Added escapeCurrencyDollars to escape $<digit> patterns to \$<digit> in prose segments (not in code). Trade-off: math expressions starting with a digit (rare — '$5x = 10$') get escaped too. Mirrors the convention in ChatGPT/Claude's UIs.

4. \(...\) and \[...\] LaTeX brackets unsupported. Models often emit these instead of $...$ / $$...$$. Added rewriteLatexBracketDelimiters preprocessor pass.

5. ```latex / ```tex blocks were being routed to KaTeX via a rewrite to ```math. This change aligns with the GitHub markdown convention: ```math = render as math; ```latex / ```tex = LaTeX/TeX source code (syntax highlighted, not rendered). Conflating them broke teaching/showing-source use cases. MATH_FENCE_LANGUAGES pruned to {'math'} only.

Also flipped parseIncompleteMarkdown to true (was !isStreaming) so the math parser can't see $ inside streaming-but-not-yet-closed code fences. Shiki was already deferred via defer={isStreaming} so this doesn't introduce new tokenization cost.

Test: 18/18 existing tests still pass; one test updated to expect escaped \$ in currency-prose-with-URL case.
2026-07-26 17:38:36 +00:00 · 2026-05-12 22:13:30 -04:00 · 2026-05-12 22:13:30 -04:00 · 708d2a0c33
commit 708d2a0c33
parent 747caa74f0
3 changed files with 125 additions and 4 deletions
--- a/apps/desktop/src/components/assistant-ui/markdown-text.test.ts
+++ b/apps/desktop/src/components/assistant-ui/markdown-text.test.ts
@@ -136,8 +136,13 @@ describe('preprocessMarkdown', () => {
    const output = preprocessMarkdown(input)

    expect(output).not.toContain('```')
+    // Currency dollar amounts get escaped to `\$` in the preprocessor
+    // so they don't get parsed as math delimiters by remark-math (we
+    // enable singleDollarTextMath, which would otherwise greedy-match
+    // `$56...$99` as one big inline math span). The escape is invisible
+    // to the user — `\$` renders as a literal `$` in the final output.
    expect(output).toContain(
-      '~$56<https://www.getyourguide.com/san-juan-puerto-rico-l355/san-juan-snorkel-sea-turtles-manatees-free-video-rum-t879147/> Old San Juan Sunset Cruise'
+      '~\\$56<https://www.getyourguide.com/san-juan-puerto-rico-l355/san-juan-snorkel-sea-turtles-manatees-free-video-rum-t879147/> Old San Juan Sunset Cruise'
    )
    expect(output).toContain(
      '<https://www.getyourguide.com/en-gb/san-juan-puerto-rico-l355/san-juan-old-san-juan-sunset-cruise-with-drinks-transfer-t405191/>'
--- a/apps/desktop/src/components/assistant-ui/markdown-text.tsx
+++ b/apps/desktop/src/components/assistant-ui/markdown-text.tsx
@@ -335,7 +335,15 @@ const MarkdownTextImpl = () => {
      )}
      lineNumbers={false}
      mode="streaming"
-      parseIncompleteMarkdown={!isStreaming}
+      // Always auto-close incomplete fences — even during streaming.
+      // Without this, an unclosed ```python ... ``` whose body contains
+      // `$` (very common: shell snippets, JS template strings, dollar
+      // amounts) leaks those dollars out to the math parser and they
+      // get rendered as broken inline math until the closing fence
+      // arrives. Shiki is independently deferred via `defer={isStreaming}`
+      // on the SyntaxHighlighter component, so we don't pay code-block
+      // tokenization on every token even with this set.
+      parseIncompleteMarkdown
      plugins={{ math: mathPlugin, ...(isStreaming ? {} : { code }) }}
      preprocess={preprocessMarkdown}
      shikiTheme={['github-light-default', 'github-dark-default']}
--- a/apps/desktop/src/lib/markdown-preprocess.ts
+++ b/apps/desktop/src/lib/markdown-preprocess.ts
@@ -94,7 +94,14 @@ function scrubBacktickNoise(text: string): string {
  out += text.slice(cursor).replace(fenceNoiseRe, '')

  for (let pass = 0; pass < 2; pass += 1) {
-    out = out.replace(/``\s*``/g, '')
+    // Match EXACTLY 2 backticks (not part of a longer run) on each side.
+    // Without the lookbehind/lookahead, two adjacent triple-backtick
+    // fences with only whitespace between them get spliced together —
+    // e.g. ```bash\n...\n```\n\n```latex matches the regex's
+    // last-2-of-bash-close + \n\n + first-2-of-latex-open and the
+    // surrounding fence markers collapse into a single longer block,
+    // which the markdown parser then treats as ONE giant code block.
+    out = out.replace(/(?<!`)``(?!`)\s*(?<!`)``(?!`)/g, '')
    out = out.replace(/(^|[^`])``(?=\s|[.,;:!?)\]'"\u2014\u2013-]|$)/g, '$1')
  }

@@ -164,6 +171,22 @@ function findClosingFence(lines: string[], start: number, marker: string): numbe
  return -1
 }

+// Languages that should be routed to the math (KaTeX) renderer instead of
+// being shown as a syntax-highlighted code block.
+//
+// We deliberately recognize ONLY `math` here, not `latex` or `tex`.
+// Reasoning: GitHub-style markdown uses ` ```math ` to mean "render as
+// math" and ` ```latex `/` ```tex ` to mean "show LaTeX/TeX source code"
+// (syntax highlighted). Conflating the two breaks code blocks where a
+// user is *discussing* LaTeX rather than embedding it (e.g.,
+// ```latex\n\begin{equation}\n  E = mc^2\n\end{equation}``` shown as a
+// teaching example). Anyone who wants math rendered should use ```math.
+const MATH_FENCE_LANGUAGES = new Set(['math'])
+
+function isMathFence(language: string): boolean {
+  return MATH_FENCE_LANGUAGES.has(language.toLowerCase())
+}
+
 function normalizeFenceBlocks(text: string): string {
  const sourceLines = text.split('\n')
  const out: string[] = []
@@ -226,6 +249,15 @@ function normalizeFenceBlocks(text: string): string {

      if (isLikelyProseFence(infoRaw, body)) {
        pushProseFence(out, indent, infoRaw, bodyLines)
+      } else if (isMathFence(language)) {
+        // Streaming math fence — rewrite the language tag to "math".
+        // remark-math + rehype-katex pick up ```math fenced blocks via
+        // the language-math class on the resulting <code> element. We
+        // keep the fence intact (instead of converting to $$..$$) so
+        // any literal `$$` characters in the body don't collide with
+        // an outer math wrapper. No close emitted yet — streaming.
+        out.push(`${indent}${marker}math`)
+        out.push(...bodyLines)
      } else {
        out.push(`${indent}${marker}${language}`)
        out.push(...bodyLines)
@@ -241,6 +273,21 @@ function normalizeFenceBlocks(text: string): string {
      continue
    }

+    if (isMathFence(language)) {
+      // Closed math fence — emit the language tag as lowercase "math"
+      // so rehype-katex's language-math class detection picks it up
+      // however the tag was cased (isMathFence is case-insensitive).
+      // Body stays untouched (no $$..$$ rewrite) so authors can write
+      // arbitrary LaTeX including `$$display$$` markers without them
+      // colliding with our wrapper. ```latex / ```tex never get here.
+      out.push(`${indent}${marker}math`)
+      out.push(...bodyLines)
+      out.push(`${indent}${marker}`)
+      index = closeIndex + 1
+
+      continue
+    }
+
    out.push(`${indent}${marker}${language}`)
    out.push(...bodyLines)
    out.push(`${indent}${marker}`)
@@ -250,6 +297,39 @@ function normalizeFenceBlocks(text: string): string {
  return out.join('\n')
 }

+// Convert LaTeX bracket delimiters to remark-math's dollar-sign syntax.
+// Models often emit `\(...\)` for inline math and `\[...\]` for display
+// math (the standard LaTeX convention) instead of `$...$` / `$$...$$`.
+// remark-math only natively recognizes the dollar form, so we rewrite at
+// preprocess time. Done with simple non-greedy matches keyed on the
+// escaped-bracket sequences: inline `\(...\)` must open and close on
+// the same line, while display `\[...\]` may span multiple lines.
+// Caveat: `\[` / `\]` are also Markdown escapes for literal brackets,
+// so prose such as `\[1\]` is rewritten too — assumed rare in practice.
+const LATEX_INLINE_RE = /\\\(([^\n]+?)\\\)/g
+const LATEX_DISPLAY_RE = /\\\[([\s\S]+?)\\\]/g
+
+function rewriteLatexBracketDelimiters(text: string): string {
+  return text.replace(LATEX_INLINE_RE, (_, body: string) => `$${body}$`).replace(LATEX_DISPLAY_RE, (_, body: string) => `$$${body}$$`)
+}
+
+// Escape `$<digit>` patterns so they don't get eaten as math delimiters.
+// Models commonly write currency amounts ($5, $19.99, $1,299) in prose.
+// With `singleDollarTextMath: true`, remark-math is greedy and matches
+// EVERY pair of `$`s — including the open of `$5` to the next `$10`,
+// rendering "5 in my pocket and you have " as italicized math text.
+// The de-facto convention across math-supporting LLM UIs is to treat
+// `$` followed by a digit as currency rather than math, since math
+// expressions almost always start with a letter or `\command`. Trade-
+// off: math that starts with a digit (`$5x = 10$`, or display `$$5x$$`)
+// has the `$` before the digit escaped — annoying but rare. The escape
+// `\$` survives to render as a literal `$` in the final output.
+const CURRENCY_DOLLAR_RE = /(^|[^\\])\$(?=\d)/g
+
+function escapeCurrencyDollars(text: string): string {
+  return text.replace(CURRENCY_DOLLAR_RE, '$1\\$')
+}
+
 export function preprocessMarkdown(text: string): string {
  const cleaned = text.replace(REASONING_BLOCK_RE, '').replace(PREVIEW_MARKER_RE, '')
  const scrubbed = scrubBacktickNoise(cleaned)
@@ -258,7 +338,35 @@ export function preprocessMarkdown(text: string): string {

  return strippedEmptyFences
    .split(CODE_FENCE_SPLIT_RE)
-    .map(part => (/^(?:```|~~~)/.test(part) ? part : normalizeVisibleProse(stripPreviewTargets(part))))
+    .map(part => {
+      // Fence blocks pass through untouched.
+      if (/^(?:```|~~~)/.test(part)) {return part}
+
+      // Whitespace-only segments (e.g. the `\n\n` between two adjacent
+      // fences) must NOT go through stripPreviewTargets — its internal
+      // .trim() would collapse them to '' and glue the surrounding
+      // fences together, producing things like ``````math which the
+      // markdown parser then reads as a single 6-backtick block.
+      if (!part.trim()) {return part}
+      // Preserve leading/trailing whitespace around the prose body so
+      // that fence-prose-fence sequences keep their blank-line gaps.
+      // stripPreviewTargets internally calls .trim() on its result for
+      // the benefit of its other (single-segment) callers; here we're
+      // operating on a SEGMENT of a larger document where outer
+      // whitespace is structural and must survive.
+      const leading = part.match(/^\s*/)?.[0] ?? ''
+      const trailing = part.match(/\s*$/)?.[0] ?? ''
+
+      // rewriteLatexBracketDelimiters runs only on prose segments so
+      // we don't accidentally touch `\(` inside a code block.
+      // escapeCurrencyDollars likewise only runs on prose, so legit
+      // `$5` literals inside fenced code stay intact.
+      const transformed = normalizeVisibleProse(
+        stripPreviewTargets(rewriteLatexBracketDelimiters(escapeCurrencyDollars(part)))
+      )
+
+      return leading + transformed + trailing
+    })
    .join('')
    .replace(/[ \t]+\n/g, '\n')
 }