From cb039ac000ed2c1ff00e18a2d9fc40a62673b531 Mon Sep 17 00:00:00 2001 From: Austin Pickett Date: Tue, 28 Apr 2026 21:20:43 -0400 Subject: [PATCH] fix: account for latex --- ui-tui/src/__tests__/mathUnicode.test.ts | 98 ++++++++++- ui-tui/src/components/markdown.tsx | 77 ++++++-- ui-tui/src/lib/mathUnicode.ts | 213 ++++++++++++++++++++++- 3 files changed, 367 insertions(+), 21 deletions(-) diff --git a/ui-tui/src/__tests__/mathUnicode.test.ts b/ui-tui/src/__tests__/mathUnicode.test.ts index f395174bd7..fb9f029aa8 100644 --- a/ui-tui/src/__tests__/mathUnicode.test.ts +++ b/ui-tui/src/__tests__/mathUnicode.test.ts @@ -1,6 +1,8 @@ import { describe, expect, it } from 'vitest' -import { texToUnicode } from '../lib/mathUnicode.js' +import { BOX_CLOSE, BOX_OPEN, BOX_RE, texToUnicode } from '../lib/mathUnicode.js' + +const stripBox = (s: string) => s.replace(BOX_RE, '$1') describe('texToUnicode — symbols', () => { it('substitutes lowercase Greek', () => { @@ -116,6 +118,100 @@ describe('texToUnicode — fractions', () => { it('handles nested fractions', () => { expect(texToUnicode('\\frac{1}{\\frac{1}{x}}')).toBe('1/(1/x)') }) + + it('handles braces inside numerator / denominator (regression: regex \\frac couldn\'t)', () => { + // The regex-only `\frac` matcher used `[^{}]*` for each arg, which + // failed the moment a numerator contained its own braces (here the + // `{p-1}` from a superscript). The balanced-brace parser handles it. 
+ expect(texToUnicode('\\frac{|t|^{p-1}|P(t)|^p}{(p-1)!}')).toBe('(|t|ᵖ⁻¹|P(t)|ᵖ)/((p-1)!)') + }) + + it('preserves \\frac when arguments are malformed', () => { + expect(texToUnicode('\\frac{a}')).toBe('\\frac{a}') + expect(texToUnicode('\\fraction{a}{b}')).toBe('\\fraction{a}{b}') + }) +}) + +describe('texToUnicode — typography no-ops', () => { + it('strips \\displaystyle / \\textstyle / \\scriptstyle / \\scriptscriptstyle', () => { + expect(texToUnicode('\\displaystyle\\sum_{i=1}^n x_i')).toBe('∑ᵢ₌₁ⁿ xᵢ') + expect(texToUnicode('f(x) = \\displaystyle \\frac{1}{2}')).toBe('f(x) = 1/2') + expect(texToUnicode('\\textstyle x + y')).toBe('x + y') + }) + + it('strips \\limits / \\nolimits which only affect bound positioning', () => { + expect(texToUnicode('\\sum\\limits_{k=1}^n a_k')).toBe('∑ₖ₌₁ⁿ aₖ') + expect(texToUnicode('\\int\\nolimits_0^1 f(x) dx')).toBe('∫₀¹ f(x) dx') + }) + + it('does not eat letter-continuation commands like \\limit_inf', () => { + // The `(?![A-Za-z])` lookahead protects hypothetical commands that + // start with `\limit` / `\display` / etc. The bare names are stripped + // but anything longer is preserved verbatim. 
+ expect(texToUnicode('\\limitinf x')).toBe('\\limitinf x') + }) +}) + +describe('texToUnicode — sizing wrappers', () => { + it('strips \\big / \\Big / \\bigg / \\Bigg before delimiters', () => { + expect(texToUnicode('\\bigl[ x \\bigr]')).toBe('[ x ]') + expect(texToUnicode('\\Big( y \\Big)')).toBe('( y )') + expect(texToUnicode('\\bigg| z \\bigg|')).toBe('| z |') + expect(texToUnicode('\\Biggl\\{ a \\Biggr\\}')).toBe('{ a }') + }) + + it('does not eat \\bigtriangleup or other letter-continuations', () => { + expect(texToUnicode('A \\bigtriangleup B')).toBe('A \\bigtriangleup B') + }) +}) + +describe('texToUnicode — modular arithmetic and tags', () => { + it('renders \\pmod{p} as " (mod p)"', () => { + expect(texToUnicode('a \\equiv b \\pmod{p}')).toBe('a ≡ b (mod p)') + }) + + it('renders \\bmod / \\mod inline', () => { + expect(texToUnicode('a \\bmod n')).toBe('a mod n') + }) + + it('collapses \\tag{n} to " (n)"', () => { + expect(texToUnicode('x = y \\tag{24}')).toBe('x = y (24)') + }) +}) + +describe('texToUnicode — newly added symbols', () => { + it('renders \\nmid, \\blacksquare, \\qed', () => { + expect(texToUnicode('p \\nmid q')).toBe('p ∤ q') + expect(texToUnicode('Therefore \\blacksquare')).toBe('Therefore ■') + expect(texToUnicode('done \\qed')).toBe('done ∎') + }) +}) + +describe('texToUnicode — \\boxed / \\fbox', () => { + // `\boxed` produces non-printable U+0001 / U+0002 sentinels around its + // content so the markdown renderer can apply highlight styling. These + // tests assert both the sentinel form and the human-readable + // strip-fallback (BOX_RE). 
+ it('wraps simple boxed content in BOX_OPEN/BOX_CLOSE sentinels', () => { + expect(texToUnicode('\\boxed{x = 0}')).toBe(`${BOX_OPEN}x = 0${BOX_CLOSE}`) + expect(stripBox(texToUnicode('\\boxed{x = 0}'))).toBe('x = 0') + expect(stripBox(texToUnicode('\\fbox{answer}'))).toBe('answer') + }) + + it('handles boxed expressions with nested braces (regression: regex couldn\'t)', () => { + // A `[^{}]*` regex would stop at the first `{` inside the body. The + // balanced-brace parser walks past it. + expect(stripBox(texToUnicode('\\boxed{x^{n+1}}'))).toBe('xⁿ⁺¹') + expect(stripBox(texToUnicode('\\boxed{\\frac{a}{b}}'))).toBe('a/b') + }) + + it('handles real-world boxed final answer', () => { + expect(stripBox(texToUnicode('\\boxed{J = -\\sum_{k=0}^n a_k F(k)}'))).toBe('J = -∑ₖ₌₀ⁿ aₖ F(k)') + }) + + it('preserves \\boxed without a brace argument', () => { + expect(texToUnicode('\\boxed something')).toBe('\\boxed something') + }) }) describe('texToUnicode — combining marks', () => { diff --git a/ui-tui/src/components/markdown.tsx b/ui-tui/src/components/markdown.tsx index 46e6297426..a052bbf297 100644 --- a/ui-tui/src/components/markdown.tsx +++ b/ui-tui/src/components/markdown.tsx @@ -2,10 +2,59 @@ import { Box, Link, Text } from '@hermes/ink' import { memo, type ReactNode, useMemo } from 'react' import { ensureEmojiPresentation } from '../lib/emoji.js' -import { texToUnicode } from '../lib/mathUnicode.js' +import { BOX_CLOSE, BOX_OPEN, texToUnicode } from '../lib/mathUnicode.js' import { highlightLine, isHighlightable } from '../lib/syntax.js' import type { Theme } from '../theme.js' +// `\boxed{X}` regions in `texToUnicode` output are marked with the +// non-printable U+0001 / U+0002 sentinels. Split on them and render the +// boxed segment with `inverse + bold` so it reads as a highlighter-pen +// emphasis on top of whatever color the parent `` is using (amber +// for math). 
The leading / trailing space inside the highlight gives a +// one-cell visual margin so the highlight reads as a block, not a hug. +const renderMath = (text: string): ReactNode => { + if (!text.includes(BOX_OPEN)) { + return text + } + + const out: ReactNode[] = [] + let i = 0 + let key = 0 + + while (i < text.length) { + const start = text.indexOf(BOX_OPEN, i) + + if (start < 0) { + out.push(text.slice(i)) + + break + } + + if (start > i) { + out.push(text.slice(i, start)) + } + + const end = text.indexOf(BOX_CLOSE, start + 1) + + if (end < 0) { + out.push(text.slice(start)) + + break + } + + out.push( + + {' '} + {text.slice(start + 1, end)}{' '} + + ) + + i = end + 1 + } + + return out +} + const FENCE_RE = /^\s*(`{3,}|~{3,})(.*)$/ const FENCE_CLOSE_RE = /^\s*(`{3,}|~{3,})\s*$/ const HR_RE = /^ {0,3}([-*_])(?:\s*\1){2,}\s*$/ @@ -171,31 +220,39 @@ function MdInline({ t, text }: { t: Theme; text: string }) { } else if (m[6]) { parts.push( - {m[6]} + ) } else if (m[7]) { + // Code is the one wrap that does NOT recurse — inline `code` spans + // are verbatim by definition. Letting MdInline reprocess them + // would corrupt regex examples and shell snippets. parts.push( {m[7]} ) } else if (m[8] ?? m[9]) { + // Recurse into bold / italic / strike / highlight so nested + // `$...$` math (and other inline tokens) inside a `**bolded + // statement with $\mathbb{Z}$ math**` actually render. Without + // this the inner content is dropped into a single `` + // verbatim and the math renderer never sees it. parts.push( - {m[8] ?? m[9]} + ) } else if (m[10] ?? m[11]) { parts.push( - {m[10] ?? m[11]} + ) } else if (m[12]) { parts.push( - {m[12]} + ) } else if (m[13]) { @@ -236,7 +293,7 @@ function MdInline({ t, text }: { t: Theme; text: string }) { // raw LaTeX rather than vanishing. parts.push( - {texToUnicode(m[17] ?? m[18]!)} + {renderMath(texToUnicode(m[17] ?? 
m[18]!))} ) } @@ -456,9 +513,7 @@ function MdImpl({ compact, t, text }: MdProps) { start('code') nodes.push( - ─ math - - {inner ? {texToUnicode(inner)} : null} + {inner ? {renderMath(texToUnicode(inner))} : null} ) i++ @@ -504,11 +559,9 @@ function MdImpl({ compact, t, text }: MdProps) { start('code') nodes.push( - ─ math - {block.map((l, j) => ( - {texToUnicode(l)} + {renderMath(texToUnicode(l))} ))} diff --git a/ui-tui/src/lib/mathUnicode.ts b/ui-tui/src/lib/mathUnicode.ts index 162cc265a7..7c6f8939ce 100644 --- a/ui-tui/src/lib/mathUnicode.ts +++ b/ui-tui/src/lib/mathUnicode.ts @@ -136,6 +136,20 @@ const SYMBOLS: Record = { '\\models': '⊨', '\\vdash': '⊢', '\\mid': '∣', + '\\nmid': '∤', + '\\divides': '∣', + + // Common standalone glyphs + '\\blacksquare': '■', + '\\square': '□', + '\\Box': '□', + '\\qed': '∎', + '\\bigstar': '★', + + // Modular arithmetic — the `\pmod{p}` form (with arg) is handled below; + // the bare `\bmod` / `\mod` commands are simple text substitutions. + '\\bmod': 'mod', + '\\mod': 'mod', // Brackets / fences (named delimiter commands; the `\left\X` / `\right\X` // unwrapping below leaves these behind for the symbol pass to resolve). @@ -403,6 +417,14 @@ const SUBSCRIPT: Record = { x: 'ₓ' } +// Sentinel control characters used to mark `\boxed` / `\fbox` regions in +// the converted output. The renderer splits on these to apply a highlight +// style; consumers that don't want highlighting can strip them with the +// exported `BOX_RE` below. +export const BOX_OPEN = '\u0001' +export const BOX_CLOSE = '\u0002' +export const BOX_RE = /\u0001([^\u0001\u0002]*)\u0002/g + const escapeRe = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') // Pre-compile two symbol regexes: one for letter-ending commands (`\pi`, @@ -473,6 +495,154 @@ const convertScript = (input: string, table: Record, sigil: '^' return `${sigil}(${trimmed})` } +// Walk the string and parse `{...}` honouring nested braces. 
+// Unlike a `\{[^{}]*\}` regex this survives `\frac{|t|^{p-1}|P(t)|^p}{...}`
+// where the numerator contains its own braces from a superscript. Returns
+// the inner content (without the outer braces) and the offset just past the
+// closing `}`. Returns null if there is no balanced brace at `start`.
+const readBraced = (s: string, start: number): { content: string; end: number } | null => {
+  if (s[start] !== '{') {
+    return null
+  }
+
+  let depth = 1
+  let i = start + 1
+
+  while (i < s.length && depth > 0) {
+    const c = s[i]
+
+    // Skip escapes — `\{` and `\}` inside a body are literal braces and
+    // should not change the brace counter.
+    if (c === '\\' && i + 1 < s.length) {
+      i += 2
+      continue
+    }
+
+    if (c === '{') {
+      depth++
+    } else if (c === '}') {
+      depth--
+    }
+
+    if (depth > 0) {
+      i++
+    }
+  }
+
+  if (depth !== 0) {
+    return null
+  }
+
+  return { content: s.slice(start + 1, i), end: i + 1 }
+}
+
+// Replace every `\command{arg}` using balanced-brace parsing (so
+// `\boxed{x^{n+1}}` works where a `[^{}]*` regex would fail). `render` gets
+// the inner content with nested occurrences already replaced. A `\command`
+// with no balanced `{...}` after it is kept verbatim, and scanning resumes
+// at the first character after it so an adjacent command is not skipped.
+const replaceBracedCommand = (input: string, command: string, render: (content: string) => string): string => {
+  const cmdLen = command.length
+  let out = ''
+  let i = 0
+
+  while (i < input.length) {
+    const idx = input.indexOf(command, i)
+
+    if (idx < 0) {
+      out += input.slice(i)
+
+      return out
+    }
+
+    const after = input[idx + cmdLen]
+
+    if (after && /[A-Za-z]/.test(after)) {
+      out += input.slice(i, idx + cmdLen)
+      i = idx + cmdLen
+      continue
+    }
+
+    out += input.slice(i, idx)
+
+    let p = idx + cmdLen
+
+    while (input[p] === ' ' || input[p] === '\t') p++
+
+    const arg = readBraced(input, p)
+
+    if (!arg) {
+      out += input.slice(idx, p)
+      i = p // p > i, so this still advances; input[p] may begin the next command
+      continue
+    }
+
+    out += render(replaceBracedCommand(arg.content, command, render))
+    i = arg.end
+  }
+
+  return out
+}
+
+// Replace every `\frac{num}{den}` with `num/den` (parens around either
+// side when its precedence demands it). The recursion handles nested
+// fractions naturally: `\frac{1}{\frac{1}{x}}` collapses to `1/(1/x)`
+// because we recurse into `den` before deciding whether to parenthesise.
+const replaceFracs = (input: string): string => {
+  let out = ''
+  let i = 0
+
+  while (i < input.length) {
+    const idx = input.indexOf('\\frac', i)
+
+    if (idx < 0) {
+      out += input.slice(i)
+
+      return out
+    }
+
+    const after = input[idx + 5]
+
+    // `(?![A-Za-z])` — protect hypothetical commands like `\fraction`.
+    if (after && /[A-Za-z]/.test(after)) {
+      out += input.slice(i, idx + 5)
+      i = idx + 5
+      continue
+    }
+
+    out += input.slice(i, idx)
+
+    let p = idx + 5
+
+    while (input[p] === ' ' || input[p] === '\t') p++
+
+    const num = readBraced(input, p)
+
+    if (!num) {
+      out += input.slice(idx, p)
+      i = p // keep input[p] in play: it may be the `\` of another `\frac`
+      continue
+    }
+
+    p = num.end
+
+    while (input[p] === ' ' || input[p] === '\t') p++
+
+    const den = readBraced(input, p)
+
+    if (!den) {
+      out += input.slice(idx, p)
+      i = p // same as above: never consume a character we did not examine
+      continue
+    }
+
+    out += `${wrapForFrac(replaceFracs(num.content))}/${wrapForFrac(replaceFracs(den.content))}`
+    i = den.end
+  }
+
+  return out
+}
+
 // Wrap multi-token expressions in parens so `\frac{a+b}{c}` becomes
 // `(a+b)/c` rather than `a+b/c`. We only wrap when the expression has
 // loose precedence — additive operators or whitespace that would change
@@ -516,15 +686,18 @@ export function texToUnicode(input: string): string {
   s = s.replace(/\\dot\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0307`)
   s = s.replace(/\\ddot\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0308`)
 
-  // Apply \frac repeatedly so nested fractions resolve from the inside
-  // out — `\frac{1}{1+\frac{1}{x}}` collapses cleanly.
-  let prev = ''
-  let guard = 0
+  s = replaceFracs(s)
 
-  while (s !== prev && guard++ < 8) {
-    prev = s
-    s = s.replace(/\\frac\s*\{([^{}]*)\}\s*\{([^{}]*)\}/g, (_, num: string, den: string) => `${wrapForFrac(num)}/${wrapForFrac(den)}`)
-  }
+  // `\boxed{X}` / `\fbox{X}` highlight a final answer. Terminals can't
+  // draw a real box, so we wrap the content in U+0001 / U+0002 control
+  // characters — non-printable, never present in real text — and let the
+  // markdown renderer split on them and apply a highlight style (inverse
+  // video) to the bracketed region. This keeps `texToUnicode` pure-string
+  // while letting the React layer do the actual visual emphasis.
+  // Argument is parsed with balanced braces so nested `{...}` from
+  // superscripts / fractions inside the box survive.
+ s = replaceBracedCommand(s, '\\boxed', body => `${BOX_OPEN}${body.trim()}${BOX_CLOSE}`) + s = replaceBracedCommand(s, '\\fbox', body => `${BOX_OPEN}${body.trim()}${BOX_CLOSE}`) // `\xrightarrow{label}` / `\xleftarrow{label}` collapse to an arrow with // the label inline. LaTeX renders the label above the arrow; in monospace @@ -537,6 +710,30 @@ export function texToUnicode(input: string): string { s = s.replace(/\\Longleftarrow/g, '⟸') s = s.replace(/\\Longleftrightarrow/g, '⟺') + // `\pmod{p}` → ` (mod p)` (LaTeX adds parens automatically); `\pod{p}` + // is a paren-less variant; `\tag{n}` is the equation-number annotation + // shown to the right of an equation. Collapse to a single-space-prefixed + // bracketed form. The leading `\s*` in the pattern absorbs any whitespace + // already in the source so we don't end up with `b (mod p)` (double + // space) when the user wrote `b \pmod{p}`. + s = s.replace(/\s*\\pmod\s*\{([^{}]*)\}/g, (_, p: string) => ` (mod ${p.trim()})`) + s = s.replace(/\s*\\pod\s*\{([^{}]*)\}/g, (_, p: string) => ` (${p.trim()})`) + s = s.replace(/\s*\\tag\s*\{([^{}]*)\}/g, (_, n: string) => ` (${n.trim()})`) + + // `\big`, `\Big`, `\bigg`, `\Bigg` (with optional `l`/`r`/`m` suffix) + // are sizing wrappers analogous to `\left`/`\right` but without the + // automatic-pairing semantics. Strip them and leave whatever delimiter + // follows. The trailing `(?![A-Za-z])` protects `\bigtriangleup` and + // any other letter-continuation command from being shaved. + s = s.replace(/\\(?:Bigg|bigg|Big|big)[lrm]?(?![A-Za-z])/g, '') + + // Style / size hints that don't typeset any glyph and only affect how + // things would be sized in a real LaTeX engine. In a terminal every + // glyph is one monospace cell, so there's nothing to do — drop them + // (with any trailing whitespace) so they don't leak through as raw + // `\displaystyle` in the output. 
+ s = s.replace(/\\(?:scriptscriptstyle|displaystyle|scriptstyle|textstyle|nolimits|limits)(?![A-Za-z])\s*/g, '') + // `\left` and `\right` are sizing wrappers around any delimiter — bare // (`\left(`), escaped (`\left\{`), or named (`\left\langle`). Strip the // wrapper unconditionally and let the rest of the pipeline (or the