mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-12 03:42:08 +00:00
Address two Copilot review comments on PR #17175. - `wrapForFrac` doc said "additive operators or whitespace" but the implementation also matches `*` and `/`. The wider behaviour is the one we want (nested products and fractions need parens to disambiguate inline `/`), so the doc is updated to match instead of tightening the regex. - `fenceOpenAt` was flagged as "overly conservative" vs. `markdown.tsx`, which falls back to paragraph rendering for unclosed `$$` openers. Mirroring that fallback in the streaming chunker would prematurely commit a paragraph rendering of the unclosed opener to the monotonic stable prefix, where it would be frozen and become wrong the moment the closer streams in. The asymmetry is deliberate; document why so it isn't "fixed" again later. Made-with: Cursor
770 lines
20 KiB
TypeScript
770 lines
20 KiB
TypeScript
// Best-effort LaTeX → Unicode for inline / display math captured by the
// markdown renderer. The terminal can't typeset LaTeX, but Unicode covers
// most of what models actually emit: Greek letters, blackboard / fraktur /
// calligraphic capitals, set theory + logic operators, common arrows,
// sub/superscripts, and `\frac{a}{b}` collapsed to `a/b`.
//
// Design rules:
//   • Mostly a regex pipeline, plus a small balanced-brace scanner for
//     `\frac` / `\boxed`. Anything we don't recognise is preserved
//     verbatim (so a `\foo{bar}` we've never heard of still survives).
//     A real LaTeX parser would be more correct but throws on partial
//     input — terminal users would rather see the raw command than a
//     parse-error placeholder.
//   • Longest-match-first ordering on commands so `\le` doesn't shadow
//     `\leq`, `\sub` doesn't shadow `\subseteq`, etc.
//   • Word-boundary lookahead `(?![A-Za-z])` after each command so
//     `\pix` (made-up command) doesn't get partially substituted as `π`.
//   • `\mathbb{X}`, `\mathcal{X}`, `\mathfrak{X}` only handle a single
//     letter argument — multi-letter `\mathbb{NN}` is rare and would
//     need a real parser to do correctly.
//   • Sub/super scripts only convert if EVERY character has a Unicode
//     equivalent. Mixed content like `^{i\pi}` falls back to a
//     parenthesised `^(iπ)` so we don't emit a half-converted `ⁱπ`
//     (which reads worse than the source).

/**
 * Argument-less LaTeX commands and their Unicode (or plain-text)
 * replacements. Keys are the literal command text including the leading
 * backslash; values are substituted verbatim.
 */
const SYMBOLS: Record<string, string> = {
  // Greek lowercase
  '\\alpha': 'α',
  '\\beta': 'β',
  '\\gamma': 'γ',
  '\\delta': 'δ',
  '\\epsilon': 'ε',
  '\\varepsilon': 'ε',
  '\\zeta': 'ζ',
  '\\eta': 'η',
  '\\theta': 'θ',
  '\\vartheta': 'ϑ',
  '\\iota': 'ι',
  '\\kappa': 'κ',
  '\\lambda': 'λ',
  '\\mu': 'μ',
  '\\nu': 'ν',
  '\\xi': 'ξ',
  '\\pi': 'π',
  '\\varpi': 'ϖ',
  '\\rho': 'ρ',
  '\\varrho': 'ϱ',
  '\\sigma': 'σ',
  '\\varsigma': 'ς',
  '\\tau': 'τ',
  '\\upsilon': 'υ',
  '\\phi': 'φ',
  '\\varphi': 'φ',
  '\\chi': 'χ',
  '\\psi': 'ψ',
  '\\omega': 'ω',

  // Greek uppercase
  '\\Gamma': 'Γ',
  '\\Delta': 'Δ',
  '\\Theta': 'Θ',
  '\\Lambda': 'Λ',
  '\\Xi': 'Ξ',
  '\\Pi': 'Π',
  '\\Sigma': 'Σ',
  '\\Upsilon': 'Υ',
  '\\Phi': 'Φ',
  '\\Psi': 'Ψ',
  '\\Omega': 'Ω',

  // Big operators
  '\\sum': '∑',
  '\\prod': '∏',
  '\\coprod': '∐',
  '\\int': '∫',
  '\\iint': '∬',
  '\\iiint': '∭',
  '\\oint': '∮',
  '\\bigcup': '⋃',
  '\\bigcap': '⋂',
  '\\bigvee': '⋁',
  '\\bigwedge': '⋀',
  '\\bigoplus': '⨁',
  '\\bigotimes': '⨂',

  // Calculus
  '\\partial': '∂',
  '\\nabla': '∇',
  '\\sqrt': '√',

  // Sets
  '\\emptyset': '∅',
  '\\varnothing': '∅',
  '\\infty': '∞',
  '\\in': '∈',
  '\\notin': '∉',
  '\\ni': '∋',
  '\\subset': '⊂',
  '\\supset': '⊃',
  '\\subseteq': '⊆',
  '\\supseteq': '⊇',
  '\\subsetneq': '⊊',
  '\\supsetneq': '⊋',
  '\\cup': '∪',
  '\\cap': '∩',
  '\\setminus': '∖',
  '\\complement': '∁',

  // Logic
  '\\forall': '∀',
  '\\exists': '∃',
  '\\nexists': '∄',
  '\\land': '∧',
  '\\lor': '∨',
  '\\lnot': '¬',
  '\\neg': '¬',
  '\\therefore': '∴',
  '\\because': '∵',

  // Relations
  '\\le': '≤',
  '\\leq': '≤',
  '\\ge': '≥',
  '\\geq': '≥',
  '\\ne': '≠',
  '\\neq': '≠',
  '\\ll': '≪',
  '\\gg': '≫',
  '\\approx': '≈',
  '\\equiv': '≡',
  '\\cong': '≅',
  '\\sim': '∼',
  '\\simeq': '≃',
  '\\propto': '∝',
  '\\perp': '⊥',
  '\\parallel': '∥',
  '\\models': '⊨',
  '\\vdash': '⊢',
  '\\mid': '∣',
  '\\nmid': '∤',
  '\\divides': '∣',

  // Common standalone glyphs
  '\\blacksquare': '■',
  '\\square': '□',
  '\\Box': '□',
  '\\qed': '∎',
  '\\bigstar': '★',

  // Modular arithmetic — the `\pmod{p}` form (with arg) is handled in
  // `texToUnicode`; the bare `\bmod` / `\mod` commands are simple text
  // substitutions.
  '\\bmod': 'mod',
  '\\mod': 'mod',

  // Brackets / fences (named delimiter commands; the `\left\X` / `\right\X`
  // unwrapping in `texToUnicode` leaves these behind for the symbol pass).
  '\\langle': '⟨',
  '\\rangle': '⟩',
  '\\lceil': '⌈',
  '\\rceil': '⌉',
  '\\lfloor': '⌊',
  '\\rfloor': '⌋',
  '\\|': '‖',

  // Arrows
  '\\to': '→',
  '\\rightarrow': '→',
  '\\leftarrow': '←',
  '\\leftrightarrow': '↔',
  '\\Rightarrow': '⇒',
  '\\Leftarrow': '⇐',
  '\\Leftrightarrow': '⇔',
  '\\implies': '⟹',
  '\\impliedby': '⟸',
  '\\iff': '⟺',
  '\\mapsto': '↦',
  '\\hookrightarrow': '↪',
  '\\hookleftarrow': '↩',
  '\\uparrow': '↑',
  '\\downarrow': '↓',
  '\\updownarrow': '↕',

  // Binary operators
  '\\cdot': '⋅',
  '\\cdots': '⋯',
  '\\ldots': '…',
  '\\dots': '…',
  '\\dotsb': '…',
  '\\dotsc': '…',
  '\\vdots': '⋮',
  '\\ddots': '⋱',
  '\\times': '×',
  '\\div': '÷',
  '\\pm': '±',
  '\\mp': '∓',
  '\\circ': '∘',
  '\\bullet': '•',
  '\\star': '⋆',
  '\\ast': '∗',
  '\\oplus': '⊕',
  '\\ominus': '⊖',
  '\\otimes': '⊗',
  '\\odot': '⊙',
  '\\diamond': '⋄',
  '\\angle': '∠',
  '\\triangle': '△',

  // Spacing — collapse to regular spaces (`\!`, a negative space, to nothing).
  // NOTE(review): this comment used to promise "varying widths", yet every
  // positive-width entry maps to a single space — confirm whether `\quad` /
  // `\qquad` were meant to be wider.
  '\\,': ' ',
  '\\;': ' ',
  '\\:': ' ',
  '\\!': '',
  '\\ ': ' ',
  '\\quad': ' ',
  '\\qquad': ' ',

  // Functions (LaTeX renders these in roman; we just keep the name)
  '\\sin': 'sin',
  '\\cos': 'cos',
  '\\tan': 'tan',
  '\\cot': 'cot',
  '\\sec': 'sec',
  '\\csc': 'csc',
  '\\arcsin': 'arcsin',
  '\\arccos': 'arccos',
  '\\arctan': 'arctan',
  '\\sinh': 'sinh',
  '\\cosh': 'cosh',
  '\\tanh': 'tanh',
  '\\log': 'log',
  '\\ln': 'ln',
  '\\exp': 'exp',
  '\\det': 'det',
  '\\dim': 'dim',
  '\\ker': 'ker',
  '\\lim': 'lim',
  '\\liminf': 'liminf',
  '\\limsup': 'limsup',
  '\\sup': 'sup',
  '\\inf': 'inf',
  '\\max': 'max',
  '\\min': 'min',
  '\\arg': 'arg',
  '\\gcd': 'gcd',

  // Escaped literals — model occasionally emits these for display
  '\\&': '&',
  '\\%': '%',
  '\\$': '$',
  '\\#': '#',
  '\\_': '_',
  '\\{': '{',
  '\\}': '}'
}

/**
 * Blackboard-bold (double-struck) capitals used for `\mathbb{X}`.
 * Only `A`–`Z` are present; any other letter has no entry.
 */
const BB: Record<string, string> = {
  A: '𝔸',
  B: '𝔹',
  C: 'ℂ',
  D: '𝔻',
  E: '𝔼',
  F: '𝔽',
  G: '𝔾',
  H: 'ℍ',
  I: '𝕀',
  J: '𝕁',
  K: '𝕂',
  L: '𝕃',
  M: '𝕄',
  N: 'ℕ',
  O: '𝕆',
  P: 'ℙ',
  Q: 'ℚ',
  R: 'ℝ',
  S: '𝕊',
  T: '𝕋',
  U: '𝕌',
  V: '𝕍',
  W: '𝕎',
  X: '𝕏',
  Y: '𝕐',
  Z: 'ℤ'
}

/**
 * Calligraphic (script) capitals used for `\mathcal{X}`.
 * Only `A`–`Z` are present; any other letter has no entry.
 */
const CAL: Record<string, string> = {
  A: '𝒜',
  B: 'ℬ',
  C: '𝒞',
  D: '𝒟',
  E: 'ℰ',
  F: 'ℱ',
  G: '𝒢',
  H: 'ℋ',
  I: 'ℐ',
  J: '𝒥',
  K: '𝒦',
  L: 'ℒ',
  M: 'ℳ',
  N: '𝒩',
  O: '𝒪',
  P: '𝒫',
  Q: '𝒬',
  R: 'ℛ',
  S: '𝒮',
  T: '𝒯',
  U: '𝒰',
  V: '𝒱',
  W: '𝒲',
  X: '𝒳',
  Y: '𝒴',
  Z: '𝒵'
}

/**
 * Fraktur capitals used for `\mathfrak{X}`.
 * Only `A`–`Z` are present; any other letter has no entry.
 */
const FRAK: Record<string, string> = {
  A: '𝔄',
  B: '𝔅',
  C: 'ℭ',
  D: '𝔇',
  E: '𝔈',
  F: '𝔉',
  G: '𝔊',
  H: 'ℌ',
  I: 'ℑ',
  J: '𝔍',
  K: '𝔎',
  L: '𝔏',
  M: '𝔐',
  N: '𝔑',
  O: '𝔒',
  P: '𝔓',
  Q: '𝔔',
  R: 'ℜ',
  S: '𝔖',
  T: '𝔗',
  U: '𝔘',
  V: '𝔙',
  W: '𝔚',
  X: '𝔛',
  Y: '𝔜',
  Z: 'ℨ'
}

/**
 * Characters that have a Unicode superscript form: digits, `+ - = ( )`
 * and the lowercase letters except `q` (which has no entry here).
 * A script body converts only when every character is found in this table.
 */
const SUPERSCRIPT: Record<string, string> = {
  '0': '⁰',
  '1': '¹',
  '2': '²',
  '3': '³',
  '4': '⁴',
  '5': '⁵',
  '6': '⁶',
  '7': '⁷',
  '8': '⁸',
  '9': '⁹',
  '+': '⁺',
  '-': '⁻',
  '=': '⁼',
  '(': '⁽',
  ')': '⁾',
  a: 'ᵃ',
  b: 'ᵇ',
  c: 'ᶜ',
  d: 'ᵈ',
  e: 'ᵉ',
  f: 'ᶠ',
  g: 'ᵍ',
  h: 'ʰ',
  i: 'ⁱ',
  j: 'ʲ',
  k: 'ᵏ',
  l: 'ˡ',
  m: 'ᵐ',
  n: 'ⁿ',
  o: 'ᵒ',
  p: 'ᵖ',
  r: 'ʳ',
  s: 'ˢ',
  t: 'ᵗ',
  u: 'ᵘ',
  v: 'ᵛ',
  w: 'ʷ',
  x: 'ˣ',
  y: 'ʸ',
  z: 'ᶻ'
}

/**
 * Characters that have a Unicode subscript form: digits, `+ - = ( )`
 * and a subset of lowercase letters (`a e h i j k l m n o p r s t u v x`).
 * A script body converts only when every character is found in this table.
 */
const SUBSCRIPT: Record<string, string> = {
  '0': '₀',
  '1': '₁',
  '2': '₂',
  '3': '₃',
  '4': '₄',
  '5': '₅',
  '6': '₆',
  '7': '₇',
  '8': '₈',
  '9': '₉',
  '+': '₊',
  '-': '₋',
  '=': '₌',
  '(': '₍',
  ')': '₎',
  a: 'ₐ',
  e: 'ₑ',
  h: 'ₕ',
  i: 'ᵢ',
  j: 'ⱼ',
  k: 'ₖ',
  l: 'ₗ',
  m: 'ₘ',
  n: 'ₙ',
  o: 'ₒ',
  p: 'ₚ',
  r: 'ᵣ',
  s: 'ₛ',
  t: 'ₜ',
  u: 'ᵤ',
  v: 'ᵥ',
  x: 'ₓ'
}

// Sentinel control characters used to mark `\boxed` / `\fbox` regions in
// the converted output. The renderer splits on these to apply a highlight
// style; consumers that don't want highlighting can strip them with the
// exported `BOX_RE` below (capture group 1 is the boxed content).
//
// `BOX_RE` carries the `g` flag, so it keeps `lastIndex` state when used
// with `.test()` / `.exec()`; `String.prototype.replace` resets it.
export const BOX_OPEN = '\u0001'
export const BOX_CLOSE = '\u0002'
export const BOX_RE = /\u0001([^\u0001\u0002]*)\u0002/g

/**
 * Escape every regex metacharacter in `s` so the result can be embedded
 * in a `RegExp` source and match `s` literally.
 */
const escapeRe = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')

// Pre-compile two symbol regexes: one for letter-ending commands (`\pi`,
// `\sum`) which need a `(?![A-Za-z])` lookahead so they don't partially
// match `\pix` or `\summa`, and one for punctuation-ending commands
// (`\{`, `\,`, `\|`) which must NOT have the lookahead — otherwise
// `\{p` would refuse to substitute because `p` is a letter.
//
// Longest commands first inside each group so `\leq` beats `\le`.

/**
 * Partition command names by their final character.
 *
 * @param keys Command strings (e.g. `\pi`, `\{`).
 * @returns `letter` — keys ending in an ASCII letter; `punct` — all other
 *   keys. Input order is preserved within each group.
 */
const splitByEnding = (keys: readonly string[]): { letter: string[]; punct: string[] } => {
  const letter: string[] = []
  const punct: string[] = []

  for (const k of keys) {
    if (/[A-Za-z]$/.test(k)) {
      letter.push(k)
    } else {
      punct.push(k)
    }
  }

  return { letter, punct }
}

/**
 * Build a regex alternation (`a|b|c`) from command names, longest first
 * so a longer command is tried before any command that is its prefix.
 * Each name is regex-escaped via `escapeRe`.
 *
 * Sorts a copy: `Array.prototype.sort` works in place, and reordering the
 * caller's array would be a hidden side effect.
 */
const buildAlt = (cmds: readonly string[]): string =>
  [...cmds]
    .sort((a, b) => b.length - a.length)
    .map(escapeRe)
    .join('|')

// Split the symbol table's keys once at module load and compile the two
// global regexes used by the symbol pass.
const { letter: LETTER_CMDS, punct: PUNCT_CMDS } = splitByEnding(Object.keys(SYMBOLS))

// Letter-ending commands: the negative lookahead rejects a match that is
// only a prefix of a longer, unknown command.
const SYMBOL_LETTER_RE = new RegExp('(?:' + buildAlt(LETTER_CMDS) + ')(?![A-Za-z])', 'g')
// Punctuation-ending commands: no lookahead, a letter may follow directly.
const SYMBOL_PUNCT_RE = new RegExp('(?:' + buildAlt(PUNCT_CMDS) + ')', 'g')

/**
 * Convert the body of a braced sub/superscript.
 *
 * @param input Script body without the surrounding braces.
 * @param table Per-character map to the script glyphs.
 * @param sigil `^` or `_`, re-emitted when the body cannot be converted.
 * @returns The fully converted glyph string when every character of
 *   `input` has an entry in `table`; otherwise the sigil followed by the
 *   trimmed body — bare if it is a single code point, parenthesised if not.
 */
const convertScript = (input: string, table: Record<string, string>, sigil: '^' | '_'): string => {
  let out = ''
  let allMapped = true

  // Iterate by code point so astral characters are looked up whole.
  for (const ch of input) {
    const mapped = table[ch]

    if (!mapped) {
      allMapped = false

      break
    }

    out += mapped
  }

  if (allMapped) {
    return out
  }

  // Fallback: if the body is a single visible character (e.g. `∞` after
  // earlier symbol substitution), render it without braces — `^∞` reads
  // far better than `^{∞}` in a terminal. Multi-char bodies that don't
  // fully convert use parens (`e^(iπ)`) instead of braces (`e^{iπ}`)
  // because parens are normal punctuation while braces look like
  // unrendered LaTeX.
  const trimmed = input.trim()

  if ([...trimmed].length === 1) {
    return `${sigil}${trimmed}`
  }

  return `${sigil}(${trimmed})`
}

/**
 * Parse a `{...}` group starting at `start`, honouring nested braces.
 * Unlike a `\{[^{}]*\}` regex this survives `\frac{|t|^{p-1}|P(t)|^p}{...}`
 * where the numerator contains its own braces from a superscript.
 *
 * @param s String to scan.
 * @param start Index expected to hold the opening `{`.
 * @returns The inner content (without the outer braces) and the offset
 *   just past the closing `}`, or `null` when `s[start]` is not `{` or the
 *   group never closes.
 */
const readBraced = (s: string, start: number): { content: string; end: number } | null => {
  if (s[start] !== '{') {
    return null
  }

  let depth = 1
  let i = start + 1

  while (i < s.length && depth > 0) {
    const c = s[i]

    // Skip escapes — `\{` and `\}` inside a body are literal braces and
    // should not change the brace counter.
    if (c === '\\' && i + 1 < s.length) {
      i += 2

      continue
    }

    if (c === '{') {
      depth++
    } else if (c === '}') {
      depth--
    }

    // Stay on the closing brace once depth hits zero so `i` marks its index.
    if (depth > 0) {
      i++
    }
  }

  if (depth !== 0) {
    return null
  }

  return { content: s.slice(start + 1, i), end: i + 1 }
}

/**
 * Replace every occurrence of `\command{arg}` using balanced-brace parsing
 * (so `\boxed{x^{n+1}}` works where a `[^{}]*` regex would fail).
 *
 * @param input Text to scan.
 * @param command Literal command text including the backslash.
 * @param render Receives the argument content after nested occurrences of
 *   the same command inside it have already been replaced, so
 *   `\boxed{\boxed{x}}` resolves inner-first.
 * @returns `input` with each well-formed occurrence replaced. An
 *   occurrence with no balanced `{...}` after it is preserved verbatim,
 *   as is a longer command that merely starts with `command`.
 */
const replaceBracedCommand = (input: string, command: string, render: (content: string) => string): string => {
  const cmdLen = command.length

  // An empty command would match at every index and never advance.
  if (cmdLen === 0) {
    return input
  }

  let out = ''
  let i = 0

  while (i < input.length) {
    const idx = input.indexOf(command, i)

    if (idx < 0) {
      out += input.slice(i)

      return out
    }

    const after = input[idx + cmdLen]

    // Equivalent of `(?![A-Za-z])`: this is a longer command, leave it alone.
    if (after && /[A-Za-z]/.test(after)) {
      out += input.slice(i, idx + cmdLen)
      i = idx + cmdLen

      continue
    }

    out += input.slice(i, idx)

    let p = idx + cmdLen

    while (input[p] === ' ' || input[p] === '\t') p++

    const arg = readBraced(input, p)

    if (!arg) {
      // Keep the command and skipped whitespace verbatim, then resume AT
      // `p`. Consuming the character at `p` as well would skip a
      // well-formed occurrence that starts right there
      // (`\boxed \boxed{x}`).
      out += input.slice(idx, p)
      i = p

      continue
    }

    out += render(replaceBracedCommand(arg.content, command, render))
    i = arg.end
  }

  return out
}

/**
 * Replace every `\frac{num}{den}` with `num/den` (parens around either
 * side when `wrapForFrac` decides they are needed). Nested fractions are
 * handled by recursion: `\frac{1}{\frac{1}{x}}` collapses to `1/(1/x)`
 * because `den` is converted before deciding whether to parenthesise.
 *
 * @param input Text to scan.
 * @returns `input` with well-formed fractions collapsed. A `\frac` that
 *   lacks two balanced brace groups is preserved verbatim, as is a longer
 *   command that merely starts with `\frac`.
 */
const replaceFracs = (input: string): string => {
  let out = ''
  let i = 0

  while (i < input.length) {
    const idx = input.indexOf('\\frac', i)

    if (idx < 0) {
      out += input.slice(i)

      return out
    }

    const after = input[idx + 5]

    // `(?![A-Za-z])` — protect hypothetical commands like `\fraction`.
    if (after && /[A-Za-z]/.test(after)) {
      out += input.slice(i, idx + 5)
      i = idx + 5

      continue
    }

    out += input.slice(i, idx)

    let p = idx + 5

    while (input[p] === ' ' || input[p] === '\t') p++

    const num = readBraced(input, p)

    if (!num) {
      // Emit what was scanned and resume AT `p`. Consuming the character
      // at `p` as well would skip a well-formed `\frac` starting there.
      out += input.slice(idx, p)
      i = p

      continue
    }

    p = num.end

    while (input[p] === ' ' || input[p] === '\t') p++

    const den = readBraced(input, p)

    if (!den) {
      // Same rule as above: keep `\frac{num}` verbatim, resume at `p`.
      out += input.slice(idx, p)
      i = p

      continue
    }

    out += `${wrapForFrac(replaceFracs(num.content))}/${wrapForFrac(replaceFracs(den.content))}`
    i = den.end
  }

  return out
}

/**
 * True when `expr` is wrapped by ONE matching pair of parentheses, i.e.
 * the `(` at index 0 is closed by the final `)`. `(a+b)` qualifies;
 * `(a)+(b)` does not, because its first group closes before the end.
 */
const isFullyParenthesised = (expr: string): boolean => {
  if (!expr.startsWith('(') || !expr.endsWith(')')) {
    return false
  }

  let depth = 0

  for (let i = 0; i < expr.length; i++) {
    const c = expr[i]

    if (c === '(') {
      depth++
    } else if (c === ')') {
      depth--

      // The opening paren closed before the last character.
      if (depth === 0 && i < expr.length - 1) {
        return false
      }
    }
  }

  return depth === 0
}

/**
 * Wrap multi-token expressions in parens so `\frac{a+b}{c}` becomes
 * `(a+b)/c` rather than `a+b/c`. We wrap whenever inline `/` would
 * change the meaning — that's any binary operator (`+`, `-`, `*`, `/`)
 * or whitespace separating tokens. `*` and `/` matter because nested
 * fractions and products like `\frac{a*b}{c}` and `\frac{1/x}{y}` would
 * otherwise read as the ambiguous `a*b/c` and `1/x/y`.
 * Atomic factors like `n!` or `x^2` trigger none of these and stay
 * un-parenthesised — wrapping them just clutters the output. (`\sin x`
 * contains whitespace and therefore IS wrapped.)
 *
 * @param expr One side of a fraction, already converted.
 * @returns The trimmed expression, parenthesised when needed. An
 *   expression already enclosed by a single matching pair of parens is
 *   returned as is.
 */
const wrapForFrac = (expr: string) => {
  const trimmed = expr.trim()

  if (!trimmed) {
    return trimmed
  }

  // A `^\(.*\)$` regex would also accept `(a)+(b)`, which is NOT wrapped
  // as a whole, and would produce `(a)+(b)/c`.
  if (isFullyParenthesised(trimmed)) {
    return trimmed
  }

  if (/[+\-/*]|\s/.test(trimmed)) {
    return `(${trimmed})`
  }

  return trimmed
}

/**
 * Convert a LaTeX math fragment to a best-effort Unicode approximation.
 *
 * The passes run in a fixed order: alphabet / font commands, accents,
 * fractions, boxes, labelled arrows, `\pmod`-style annotations, sizing and
 * style wrappers, symbol substitution, and finally sub/superscripts.
 * Anything not recognised is left in the output unchanged.
 *
 * @param input LaTeX source without the surrounding math delimiters.
 * @returns The converted string. `\boxed` / `\fbox` regions are delimited
 *   by `BOX_OPEN` / `BOX_CLOSE` sentinel characters.
 */
export function texToUnicode(input: string): string {
  let s = input

  // Alphabet commands take exactly one letter; letters with no table
  // entry keep the raw command.
  s = s.replace(/\\mathbb\s*\{([A-Za-z])\}/g, (raw, c: string) => BB[c] ?? raw)
  s = s.replace(/\\mathcal\s*\{([A-Za-z])\}/g, (raw, c: string) => CAL[c] ?? raw)
  s = s.replace(/\\mathfrak\s*\{([A-Za-z])\}/g, (raw, c: string) => FRAK[c] ?? raw)

  // Font / text wrappers have no terminal equivalent: keep the argument only.
  s = s.replace(/\\mathbf\s*\{([^{}]+)\}/g, (_, c: string) => c)
  s = s.replace(/\\mathit\s*\{([^{}]+)\}/g, (_, c: string) => c)
  s = s.replace(/\\mathrm\s*\{([^{}]+)\}/g, (_, c: string) => c)
  s = s.replace(/\\text\s*\{([^{}]+)\}/g, (_, c: string) => c)
  s = s.replace(/\\operatorname\s*\{([^{}]+)\}/g, (_, c: string) => c)

  // Accents: append one combining mark after the whole argument.
  s = s.replace(/\\overline\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0305`)
  s = s.replace(/\\hat\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0302`)
  s = s.replace(/\\bar\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0304`)
  s = s.replace(/\\tilde\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0303`)
  s = s.replace(/\\vec\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u20D7`)
  s = s.replace(/\\dot\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0307`)
  s = s.replace(/\\ddot\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0308`)

  s = replaceFracs(s)

  // `\boxed{X}` / `\fbox{X}` highlight a final answer. Terminals can't
  // draw a real box, so we wrap the content in U+0001 / U+0002 control
  // characters — non-printable, never present in real text — and let the
  // markdown renderer split on them and apply a highlight style (inverse
  // video) to the bracketed region. This keeps `texToUnicode` pure-string
  // while letting the React layer do the actual visual emphasis.
  // Argument is parsed with balanced braces so nested `{...}` from
  // superscripts / fractions inside the box survive.
  s = replaceBracedCommand(s, '\\boxed', body => `${BOX_OPEN}${body.trim()}${BOX_CLOSE}`)
  s = replaceBracedCommand(s, '\\fbox', body => `${BOX_OPEN}${body.trim()}${BOX_CLOSE}`)

  // `\xrightarrow{label}` / `\xleftarrow{label}` collapse to an arrow with
  // the label inline. LaTeX renders the label above the arrow; in monospace
  // we put it adjacent — `─label→` is the closest readable approximation.
  // Run before the symbol pass so the label can still pick up Greek and
  // operator substitutions afterwards.
  s = s.replace(/\\xrightarrow\s*\{([^{}]*)\}/g, (_, label: string) => `─${label.trim()}→`)
  s = s.replace(/\\xleftarrow\s*\{([^{}]*)\}/g, (_, label: string) => `←${label.trim()}─`)
  s = s.replace(/\\Longrightarrow/g, '⟹')
  s = s.replace(/\\Longleftarrow/g, '⟸')
  s = s.replace(/\\Longleftrightarrow/g, '⟺')

  // `\pmod{p}` → ` (mod p)` (LaTeX adds parens automatically); `\pod{p}`
  // is a paren-less variant; `\tag{n}` is the equation-number annotation
  // shown to the right of an equation. Collapse to a single-space-prefixed
  // bracketed form. The leading `\s*` in the pattern absorbs any whitespace
  // already in the source so we don't end up with a double space before
  // `(mod p)` when the user wrote `b \pmod{p}`.
  s = s.replace(/\s*\\pmod\s*\{([^{}]*)\}/g, (_, p: string) => ` (mod ${p.trim()})`)
  s = s.replace(/\s*\\pod\s*\{([^{}]*)\}/g, (_, p: string) => ` (${p.trim()})`)
  s = s.replace(/\s*\\tag\s*\{([^{}]*)\}/g, (_, n: string) => ` (${n.trim()})`)

  // `\big`, `\Big`, `\bigg`, `\Bigg` (with optional `l`/`r`/`m` suffix)
  // are sizing wrappers analogous to `\left`/`\right` but without the
  // automatic-pairing semantics. Strip them and leave whatever delimiter
  // follows. The trailing `(?![A-Za-z])` protects `\bigtriangleup` and
  // any other letter-continuation command from being shaved.
  s = s.replace(/\\(?:Bigg|bigg|Big|big)[lrm]?(?![A-Za-z])/g, '')

  // Style / size hints that don't typeset any glyph and only affect how
  // things would be sized in a real LaTeX engine. In a terminal every
  // glyph is one monospace cell, so there's nothing to do — drop them
  // (with any trailing whitespace) so they don't leak through as raw
  // `\displaystyle` in the output.
  s = s.replace(/\\(?:scriptscriptstyle|displaystyle|scriptstyle|textstyle|nolimits|limits)(?![A-Za-z])\s*/g, '')

  // `\left` and `\right` are sizing wrappers around any delimiter — bare
  // (`\left(`), escaped (`\left\{`), or named (`\left\langle`). Strip the
  // wrapper unconditionally and let the rest of the pipeline (or the
  // upcoming symbol pass) handle whatever delimiter follows. The optional
  // `.?` consumes `\left.` / `\right.` which mean "no delimiter".
  // Lookahead `(?![A-Za-z])` keeps `\leftarrow` / `\leftrightarrow` safe.
  s = s.replace(/\\left(?![A-Za-z])\.?/g, '')
  s = s.replace(/\\right(?![A-Za-z])\.?/g, '')

  // Run symbol substitution BEFORE scripts so a body like `^{\infty}`
  // becomes `^{∞}` first; convertScript can then either map ∞ to a
  // superscript (it can't — Unicode lacks one) or fall back to `^∞`
  // by stripping braces around the now-single-character body.
  //
  // Punctuation pass first — these can be followed by letters (`\{p`
  // is "open-brace then p"), so the letter pass's `(?![A-Za-z])` rule
  // would wrongly block them.
  s = s.replace(SYMBOL_PUNCT_RE, m => SYMBOLS[m] ?? m)
  s = s.replace(SYMBOL_LETTER_RE, m => SYMBOLS[m] ?? m)

  // Braced scripts first, then bare single-character scripts.
  // Bare `^c` / `_c` handles ONLY alphanumerics and `+`/`-`/`=`. Parens
  // are intentionally excluded because the braced fallback can emit
  // `(...)` and we don't want the bare pass to greedily convert its
  // opening paren into `⁽` and orphan the closing one.
  // NOTE(review): the punctuation pass has already turned `\_` into a
  // plain `_`, so `a\_n` reaches the bare subscript pass as `a_n` and is
  // subscripted — confirm whether escaped underscores should be protected.
  s = s.replace(/\^\s*\{([^{}]+)\}/g, (_, body: string) => convertScript(body, SUPERSCRIPT, '^'))
  s = s.replace(/\^([A-Za-z0-9+\-=])/g, (raw, ch: string) => SUPERSCRIPT[ch] ?? raw)
  s = s.replace(/_\s*\{([^{}]+)\}/g, (_, body: string) => convertScript(body, SUBSCRIPT, '_'))
  s = s.replace(/_([A-Za-z0-9+\-=])/g, (raw, ch: string) => SUBSCRIPT[ch] ?? raw)

  return s
}