mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-07 02:51:50 +00:00
Merge pull request #17175 from NousResearch/fix/markdown
feat(latex): latex in tui
This commit is contained in:
commit
430302c197
7 changed files with 1391 additions and 50 deletions
41
ui-tui/package-lock.json
generated
41
ui-tui/package-lock.json
generated
|
|
@ -124,6 +124,7 @@
|
|||
"integrity": "sha512-CGOfOJqWjg2qW/Mb6zNsDm+u5vFQ8DxXfbM09z69p5Z6+mE1ikP2jUXw+j42Pf1XTYED2Rni5f95npYeuwMDQA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@babel/code-frame": "^7.29.0",
|
||||
"@babel/generator": "^7.29.0",
|
||||
|
|
@ -501,31 +502,6 @@
|
|||
"node": ">=6.9.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@emnapi/core": {
|
||||
"version": "1.10.0",
|
||||
"resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.10.0.tgz",
|
||||
"integrity": "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@emnapi/wasi-threads": "1.2.1",
|
||||
"tslib": "^2.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@emnapi/runtime": {
|
||||
"version": "1.10.0",
|
||||
"resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.10.0.tgz",
|
||||
"integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"optional": true,
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"tslib": "^2.4.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@emnapi/wasi-threads": {
|
||||
"version": "1.2.1",
|
||||
"resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.2.1.tgz",
|
||||
|
|
@ -1700,6 +1676,7 @@
|
|||
"integrity": "sha512-+qIYRKdNYJwY3vRCZMdJbPLJAtGjQBudzZzdzwQYkEPQd+PJGixUL5QfvCLDaULoLv+RhT3LDkwEfKaAkgSmNQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"undici-types": "~7.19.0"
|
||||
}
|
||||
|
|
@ -1710,6 +1687,7 @@
|
|||
"integrity": "sha512-ilcTH/UniCkMdtexkoCN0bI7pMcJDvmQFPvuPvmEaYA/NSfFTAgdUSLAoVjaRJm7+6PvcM+q1zYOwS4wTYMF9w==",
|
||||
"devOptional": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"csstype": "^3.2.2"
|
||||
}
|
||||
|
|
@ -1720,6 +1698,7 @@
|
|||
"integrity": "sha512-eSkwoemjo76bdXl2MYqtxg51HNwUSkWfODUOQ3PaTLZGh9uIWWFZIjyjaJnex7wXDu+TRx+ATsnSxdN9YWfRTQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@eslint-community/regexpp": "^4.12.2",
|
||||
"@typescript-eslint/scope-manager": "8.58.1",
|
||||
|
|
@ -1749,6 +1728,7 @@
|
|||
"integrity": "sha512-gGkiNMPqerb2cJSVcruigx9eHBlLG14fSdPdqMoOcBfh+vvn4iCq2C8MzUB89PrxOXk0y3GZ1yIWb9aOzL93bw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@typescript-eslint/scope-manager": "8.58.1",
|
||||
"@typescript-eslint/types": "8.58.1",
|
||||
|
|
@ -2066,6 +2046,7 @@
|
|||
"integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"acorn": "bin/acorn"
|
||||
},
|
||||
|
|
@ -2468,6 +2449,7 @@
|
|||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"baseline-browser-mapping": "^2.10.12",
|
||||
"caniuse-lite": "^1.0.30001782",
|
||||
|
|
@ -3203,6 +3185,7 @@
|
|||
"integrity": "sha512-XoMjdBOwe/esVgEvLmNsD3IRHkm7fbKIUGvrleloJXUZgDHig2IPWNniv+GwjyJXzuNqVjlr5+4yVUZjycJwfQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"@eslint-community/eslint-utils": "^4.8.0",
|
||||
"@eslint-community/regexpp": "^4.12.1",
|
||||
|
|
@ -3334,6 +3317,7 @@
|
|||
"integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/colinhacks"
|
||||
}
|
||||
|
|
@ -4242,6 +4226,7 @@
|
|||
"resolved": "https://registry.npmjs.org/ink-text-input/-/ink-text-input-6.0.0.tgz",
|
||||
"integrity": "sha512-Fw64n7Yha5deb1rHY137zHTAbSTNelUKuB5Kkk2HACXEtwIHBCf9OH2tP/LQ9fRYTl1F0dZgbW0zPnZk6FA9Lw==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"chalk": "^5.3.0",
|
||||
"type-fest": "^4.18.2"
|
||||
|
|
@ -5678,6 +5663,7 @@
|
|||
"integrity": "sha512-QP88BAKvMam/3NxH6vj2o21R6MjxZUAd6nlwAS/pnGvN9IVLocLHxGYIzFhg6fUQ+5th6P4dv4eW9jX3DSIj7A==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
|
|
@ -5787,6 +5773,7 @@
|
|||
"resolved": "https://registry.npmjs.org/react/-/react-19.2.5.tgz",
|
||||
"integrity": "sha512-llUJLzz1zTUBrskt2pwZgLq59AemifIftw4aB7JxOqf1HY2FDaGDxgwpAPVzHU1kdWabH7FauP4i1oEeer2WCA==",
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"engines": {
|
||||
"node": ">=0.10.0"
|
||||
}
|
||||
|
|
@ -6611,6 +6598,7 @@
|
|||
"integrity": "sha512-5C1sg4USs1lfG0GFb2RLXsdpXqBSEhAaA/0kPL01wxzpMqLILNxIxIOKiILz+cdg/pLnOUxFYOR5yhHU666wbw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"esbuild": "~0.27.0",
|
||||
"get-tsconfig": "^4.7.5"
|
||||
|
|
@ -6737,6 +6725,7 @@
|
|||
"integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"peer": true,
|
||||
"bin": {
|
||||
"tsc": "bin/tsc",
|
||||
"tsserver": "bin/tsserver"
|
||||
|
|
@ -6846,6 +6835,7 @@
|
|||
"integrity": "sha512-dbU7/iLVa8KZALJyLOBOQ88nOXtNG8vxKuOT4I2mD+Ya70KPceF4IAmDsmU0h1Qsn5bPrvsY9HJstCRh3hG6Uw==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"dependencies": {
|
||||
"lightningcss": "^1.32.0",
|
||||
"picomatch": "^4.0.4",
|
||||
|
|
@ -7261,6 +7251,7 @@
|
|||
"integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"peer": true,
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/colinhacks"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -61,6 +61,66 @@ describe('stripInlineMarkup', () => {
|
|||
expect(stripInlineMarkup('Yay ~! nice work ~!')).toBe('Yay ~! nice work ~!')
|
||||
expect(stripInlineMarkup('H~2~O and CO~2~')).toBe('H_2O and CO_2')
|
||||
})
|
||||
|
||||
it('strips inline math delimiters but keeps the formula text', () => {
  // Each row is [markup, expected plain text]. Only the `$...$` / `\(...\)`
  // delimiters disappear; the TeX between them is returned as written.
  const rows: ReadonlyArray<readonly [string, string]> = [
    ['$\\mathbb{Z}$ is a ring', '\\mathbb{Z} is a ring'],
    ['see \\(a + b\\) ok', 'see a + b ok']
  ]

  for (const [markup, plain] of rows) {
    expect(stripInlineMarkup(markup)).toBe(plain)
  }
})
|
||||
})
|
||||
|
||||
describe('INLINE_RE inline math', () => {
  it('matches single-dollar math and beats emphasis at the same start', () => {
    // If math were not part of INLINE_RE, `*b*` would be picked up as
    // italics and the formula would be mangled. The `$...$` span starts at
    // column 0, so it is the leftmost match and is returned whole.
    const formula = '$P=a*b*c$'

    expect(matches(formula)).toEqual([formula])
    expect(matches('see $\\mathbb{Z}$ here')).toEqual(['$\\mathbb{Z}$'])
  })

  it('does not match currency-style prose', () => {
    for (const prose of ['it costs $5 and $10', 'paid $5']) {
      expect(matches(prose)).toEqual([])
    }
  })

  it('does not let inline math swallow a $$ display fence', () => {
    // `$$x$$` is a display block, so it must not be read as two adjacent
    // inline-math spans.
    expect(matches('$$x$$')).toEqual([])
  })

  it('matches \\(...\\) inline math', () => {
    expect(matches('foo \\(x + y\\) bar')).toEqual(['\\(x + y\\)'])
  })

  it('does not corrupt subscripts/superscripts inside math', () => {
    // In prose, `_n` and `^r` are emphasis / superscript markers. Inside
    // `$...$` the whole formula is one inline-math token, so the other
    // alternatives never get to look at those characters.
    for (const formula of ['$P=a_n x^n + a_0$', '$\\beta_1,\\dots,\\beta_r$']) {
      expect(matches(formula)).toEqual([formula])
    }
  })

  it('places math content in the correct capture group (regression: m[16] is bare URL)', () => {
    // Regression guard for the group numbering. When the bare-URL pattern
    // had no capturing parens of its own, `$...$` content landed in m[16]
    // and MdInline rendered `$\\mathbb{R}$` as an underlined autolink
    // instead of math. Pin it down: URL -> m[16], `$...$` -> m[17],
    // `\(...\)` -> m[18].
    const expectGroups = (input: string, bareUrl?: string, dollarMath?: string, parenMath?: string) => {
      const first = [...input.matchAll(INLINE_RE)][0]!

      expect(first[16]).toBe(bareUrl)
      expect(first[17]).toBe(dollarMath)
      expect(first[18]).toBe(parenMath)
    }

    expectGroups('see https://example.com here', 'https://example.com')
    expectGroups('$\\mathbb{R}$', undefined, '\\mathbb{R}')
    expectGroups('\\(\\pi\\)', undefined, undefined, '\\pi')
  })
})
|
||||
|
||||
describe('protocol sentinels', () => {
|
||||
|
|
|
|||
293
ui-tui/src/__tests__/mathUnicode.test.ts
Normal file
293
ui-tui/src/__tests__/mathUnicode.test.ts
Normal file
|
|
@ -0,0 +1,293 @@
|
|||
import { describe, expect, it } from 'vitest'
|
||||
|
||||
import { BOX_CLOSE, BOX_OPEN, BOX_RE, texToUnicode } from '../lib/mathUnicode.js'
|
||||
|
||||
/** Replace every `BOX_RE` match in `s` with its first capture group. */
function stripBox(s: string): string {
  return s.replace(BOX_RE, '$1')
}
|
||||
|
||||
describe('texToUnicode — symbols', () => {
  // Each row is [TeX input, expected rendering].
  const expectRendered = (rows: ReadonlyArray<readonly [string, string]>) => {
    for (const [tex, rendered] of rows) {
      expect(texToUnicode(tex)).toBe(rendered)
    }
  }

  // Inputs that must come back exactly as they went in.
  const expectUntouched = (inputs: readonly string[]) => {
    for (const tex of inputs) {
      expect(texToUnicode(tex)).toBe(tex)
    }
  }

  it('substitutes lowercase Greek', () => {
    expectRendered([
      ['\\alpha + \\beta + \\pi', 'α + β + π'],
      ['\\omega', 'ω']
    ])
  })

  it('substitutes uppercase Greek', () => {
    expectRendered([['\\Sigma \\Omega \\Pi', 'Σ Ω Π']])
  })

  it('substitutes set theory and logic operators', () => {
    expectRendered([
      ['A \\cup B \\cap C', 'A ∪ B ∩ C'],
      ['\\forall x \\in \\emptyset', '∀ x ∈ ∅'],
      ['p \\implies q \\iff r', 'p ⟹ q ⟺ r']
    ])
  })

  it('substitutes relations and arrows', () => {
    expectRendered([
      ['a \\le b \\ge c \\ne d', 'a ≤ b ≥ c ≠ d'],
      ['f: A \\to B', 'f: A → B']
    ])
  })

  it('uses longest-match-first so \\leq beats \\le', () => {
    expectRendered([['\\leq', '≤']])
  })

  it('preserves unknown commands that share a prefix with known ones', () => {
    // `\leqq` (≦) is genuine LaTeX that the symbol table does not cover.
    // The word-boundary lookahead stops `\le` from matching its prefix, so
    // the command survives untouched instead of degrading to `≤qq`.
    expectUntouched(['\\leqq'])
  })

  it('refuses to substitute a partial command (word boundary)', () => {
    expectUntouched(['\\alphabet', '\\pin'])
  })
})
|
||||
|
||||
describe('texToUnicode — blackboard / calligraphic / fraktur', () => {
  it('renders \\mathbb capitals', () => {
    const chain = '\\mathbb{N} \\subset \\mathbb{Z} \\subset \\mathbb{Q} \\subset \\mathbb{R}'

    expect(texToUnicode('\\mathbb{R}')).toBe('ℝ')
    expect(texToUnicode(chain)).toBe('ℕ ⊂ ℤ ⊂ ℚ ⊂ ℝ')
  })

  it('renders \\mathcal and \\mathfrak', () => {
    expect(texToUnicode('\\mathcal{F} \\subset \\mathfrak{A}')).toBe('ℱ ⊂ 𝔄')
  })

  it('preserves \\mathbb{...} when argument is multi-letter or non-letter', () => {
    // Both inputs are expected back unchanged.
    for (const tex of ['\\mathbb{NN}', '\\mathbb{1}']) {
      expect(texToUnicode(tex)).toBe(tex)
    }
  })

  it('strips \\mathbf / \\mathit / \\mathrm / \\text wrappers (no Unicode bold/italic in monospace)', () => {
    // Rows are [TeX input, expected text with the wrapper removed].
    const rows: ReadonlyArray<readonly [string, string]> = [
      ['\\mathbf{x}', 'x'],
      ['\\text{if } x > 0', 'if x > 0'],
      ['\\operatorname{rank}(A)', 'rank(A)']
    ]

    for (const [tex, unwrapped] of rows) {
      expect(texToUnicode(tex)).toBe(unwrapped)
    }
  })
})
|
||||
|
||||
describe('texToUnicode — sub / superscripts', () => {
  // Each row is [TeX input, expected rendering].
  const expectRendered = (rows: ReadonlyArray<readonly [string, string]>) => {
    for (const [tex, rendered] of rows) {
      expect(texToUnicode(tex)).toBe(rendered)
    }
  }

  it('converts simple superscripts', () => {
    expectRendered([
      ['x^2 + y^2', 'x² + y²'],
      ['e^{n}', 'eⁿ']
    ])
  })

  it('converts simple subscripts', () => {
    expectRendered([
      ['a_1 + a_2 + a_n', 'a₁ + a₂ + aₙ'],
      ['x_{0}', 'x₀']
    ])
  })

  it('converts mixed-content scripts when every glyph has a Unicode form', () => {
    // `n+1` converts fully because `+`, digits and lowercase letters all
    // have superscript glyphs. A comma has no subscript glyph, so `i,j` is
    // not partially converted; it falls back to `_(i,j)`. Parens are used
    // because they read as normal grouping, whereas braces would look like
    // LaTeX that failed to render.
    expectRendered([
      ['x^{n+1}', 'xⁿ⁺¹'],
      ['a_{i,j}', 'a_(i,j)']
    ])
  })

  it('uses parens (not braces) when the body has Greek with no superscript form', () => {
    // The symbol pass turns `e^{i\pi}` into `e^{iπ}`; π has no superscript
    // glyph, so the script fallback emits the more readable `e^(iπ)`.
    expectRendered([['e^{i\\pi}', 'e^(iπ)']])
  })

  it('strips braces on script fallback when body collapses to a single char', () => {
    // `^{\infty}` becomes `^{∞}` after the symbol pass. ∞ is not in the
    // superscript table, but a one-character body lets the braces be
    // dropped, giving `^∞`.
    expectRendered([['e^{\\infty}', 'e^∞']])
  })

  it('handles a real-world sum', () => {
    expectRendered([['\\sum_{n=0}^{\\infty} \\frac{1}{n!}', '∑ₙ₌₀^∞ 1/n!']])
  })
})
|
||||
|
||||
describe('texToUnicode — fractions', () => {
  // Each row is [TeX input, expected rendering].
  const expectRendered = (rows: ReadonlyArray<readonly [string, string]>) => {
    for (const [tex, rendered] of rows) {
      expect(texToUnicode(tex)).toBe(rendered)
    }
  }

  it('collapses \\frac to a/b', () => {
    expectRendered([
      ['\\frac{1}{2}', '1/2'],
      ['\\frac{a}{b}', 'a/b']
    ])
  })

  it('parenthesises multi-token numerator / denominator', () => {
    expectRendered([
      ['\\frac{n+1}{2}', '(n+1)/2'],
      ['\\frac{a + b}{c - d}', '(a + b)/(c - d)']
    ])
  })

  it('handles nested fractions', () => {
    expectRendered([['\\frac{1}{\\frac{1}{x}}', '1/(1/x)']])
  })

  it('handles braces inside numerator / denominator (regression: regex \\frac couldn\'t)', () => {
    // The earlier regex-only matcher used `[^{}]*` per argument and gave up
    // as soon as the numerator carried braces of its own (the `{p-1}`
    // superscript here). The balanced-brace parser copes with it.
    expectRendered([['\\frac{|t|^{p-1}|P(t)|^p}{(p-1)!}', '(|t|ᵖ⁻¹|P(t)|ᵖ)/((p-1)!)']])
  })

  it('preserves \\frac when arguments are malformed', () => {
    // Both inputs are expected back unchanged.
    for (const tex of ['\\frac{a}', '\\fraction{a}{b}']) {
      expect(texToUnicode(tex)).toBe(tex)
    }
  })
})
|
||||
|
||||
describe('texToUnicode — typography no-ops', () => {
  it('strips \\displaystyle / \\textstyle / \\scriptstyle / \\scriptscriptstyle', () => {
    expect(texToUnicode('\\displaystyle\\sum_{i=1}^n x_i')).toBe('∑ᵢ₌₁ⁿ xᵢ')
    expect(texToUnicode('f(x) = \\displaystyle \\frac{1}{2}')).toBe('f(x) = 1/2')
    expect(texToUnicode('\\textstyle x + y')).toBe('x + y')
  })

  it('strips \\limits / \\nolimits which only affect bound positioning', () => {
    expect(texToUnicode('\\sum\\limits_{k=1}^n a_k')).toBe('∑ₖ₌₁ⁿ aₖ')
    expect(texToUnicode('\\int\\nolimits_0^1 f(x) dx')).toBe('∫₀¹ f(x) dx')
  })

  // The title previously cited `\limit_inf`, which is not the input under
  // test (and `_` is not a letter continuation); it now names the command
  // that is actually asserted.
  it('does not eat letter-continuation commands like \\limitinf', () => {
    // The `(?![A-Za-z])` lookahead protects hypothetical commands that
    // start with `\limit` / `\display` / etc. The bare names are stripped
    // but anything longer is preserved verbatim.
    //
    // NOTE(review): `\limitinf` does not have `\limits` as a prefix, so this
    // input would also pass without the lookahead. Consider adding a case
    // that extends a stripped name with letters once its expected output
    // has been confirmed against the implementation.
    expect(texToUnicode('\\limitinf x')).toBe('\\limitinf x')
  })
})
|
||||
|
||||
describe('texToUnicode — sizing wrappers', () => {
  it('strips \\big / \\Big / \\bigg / \\Bigg before delimiters', () => {
    // Rows are [TeX input, expected text with the sizing command removed].
    const rows: ReadonlyArray<readonly [string, string]> = [
      ['\\bigl[ x \\bigr]', '[ x ]'],
      ['\\Big( y \\Big)', '( y )'],
      ['\\bigg| z \\bigg|', '| z |'],
      ['\\Biggl\\{ a \\Biggr\\}', '{ a }']
    ]

    for (const [tex, rendered] of rows) {
      expect(texToUnicode(tex)).toBe(rendered)
    }
  })

  it('does not eat \\bigtriangleup or other letter-continuations', () => {
    const tex = 'A \\bigtriangleup B'

    expect(texToUnicode(tex)).toBe(tex)
  })
})
|
||||
|
||||
describe('texToUnicode — modular arithmetic and tags', () => {
  it('renders \\pmod{p} as " (mod p)"', () => {
    const rendered = texToUnicode('a \\equiv b \\pmod{p}')

    expect(rendered).toBe('a ≡ b (mod p)')
  })

  it('renders \\bmod / \\mod inline', () => {
    // Only `\bmod` is exercised by this assertion.
    const rendered = texToUnicode('a \\bmod n')

    expect(rendered).toBe('a mod n')
  })

  it('collapses \\tag{n} to " (n)"', () => {
    const rendered = texToUnicode('x = y \\tag{24}')

    expect(rendered).toBe('x = y (24)')
  })
})
|
||||
|
||||
describe('texToUnicode — newly added symbols', () => {
  it('renders \\nmid, \\blacksquare, \\qed', () => {
    // Rows are [TeX input, expected rendering].
    const rows: ReadonlyArray<readonly [string, string]> = [
      ['p \\nmid q', 'p ∤ q'],
      ['Therefore \\blacksquare', 'Therefore ■'],
      ['done \\qed', 'done ∎']
    ]

    for (const [tex, rendered] of rows) {
      expect(texToUnicode(tex)).toBe(rendered)
    }
  })
})
|
||||
|
||||
describe('texToUnicode — \\boxed / \\fbox', () => {
  // `\boxed` content is emitted between the non-printable U+0001 / U+0002
  // sentinels so the markdown renderer can style it as a highlight. The
  // cases below check the raw sentinel form as well as the readable text
  // left after stripping the sentinels via BOX_RE.
  const unboxed = (tex: string) => stripBox(texToUnicode(tex))

  it('wraps simple boxed content in BOX_OPEN/BOX_CLOSE sentinels', () => {
    expect(texToUnicode('\\boxed{x = 0}')).toBe(BOX_OPEN + 'x = 0' + BOX_CLOSE)
    expect(unboxed('\\boxed{x = 0}')).toBe('x = 0')
    expect(unboxed('\\fbox{answer}')).toBe('answer')
  })

  it('handles boxed expressions with nested braces (regression: regex couldn\'t)', () => {
    // A `[^{}]*` pattern would halt at the first `{` inside the body; the
    // balanced-brace parser keeps going past it.
    expect(unboxed('\\boxed{x^{n+1}}')).toBe('xⁿ⁺¹')
    expect(unboxed('\\boxed{\\frac{a}{b}}')).toBe('a/b')
  })

  it('handles real-world boxed final answer', () => {
    expect(unboxed('\\boxed{J = -\\sum_{k=0}^n a_k F(k)}')).toBe('J = -∑ₖ₌₀ⁿ aₖ F(k)')
  })

  it('preserves \\boxed without a brace argument', () => {
    const tex = '\\boxed something'

    expect(texToUnicode(tex)).toBe(tex)
  })
})
|
||||
|
||||
describe('texToUnicode — combining marks', () => {
  it('applies \\overline / \\bar / \\hat / \\vec / \\tilde', () => {
    // Rows are [TeX input, base letter followed by its combining mark].
    // Only `\overline`, `\hat` and `\vec` are asserted here.
    const rows: ReadonlyArray<readonly [string, string]> = [
      ['\\overline{x}', 'x\u0305'],
      ['\\hat{y}', 'y\u0302'],
      ['\\vec{v}', 'v\u20D7']
    ]

    for (const [tex, marked] of rows) {
      expect(texToUnicode(tex)).toBe(marked)
    }
  })
})
|
||||
|
||||
describe('texToUnicode — left/right delimiters', () => {
  // Each row is [TeX input, expected rendering].
  const expectRendered = (rows: ReadonlyArray<readonly [string, string]>) => {
    for (const [tex, rendered] of rows) {
      expect(texToUnicode(tex)).toBe(rendered)
    }
  }

  it('strips \\left and \\right keeping the delimiter character', () => {
    expectRendered([
      ['\\left( x + y \\right)', '( x + y )'],
      ['\\left| x \\right|', '| x |']
    ])
  })

  it('handles escaped delimiters \\left\\{ ... \\right\\}', () => {
    expectRendered([['\\left\\{p/q \\mid q \\neq 0\\right\\}', '{p/q ∣ q ≠ 0}']])
  })

  it('handles named delimiters via \\left\\langle / \\right\\rangle', () => {
    expectRendered([['\\left\\langle u, v \\right\\rangle', '⟨ u, v ⟩']])
  })

  it('drops \\left. and \\right. (which are explicit "no delimiter")', () => {
    expectRendered([['\\left. f \\right|', ' f |']])
  })

  it('preserves \\leftarrow / \\rightarrow (word boundary blocks the strip)', () => {
    expectRendered([['A \\leftarrow B \\rightarrow C', 'A ← B → C']])
  })
})
|
||||
|
||||
describe('texToUnicode — labelled arrows', () => {
  it('renders \\xrightarrow{label} as ─label→', () => {
    const rendered = texToUnicode('a \\xrightarrow{x=1} b')

    expect(rendered).toBe('a ─x=1→ b')
  })

  it('renders \\xleftarrow{label} as ←label─', () => {
    const rendered = texToUnicode('a \\xleftarrow{n} b')

    expect(rendered).toBe('a ←n─ b')
  })

  it('still applies symbol substitution inside the label', () => {
    const rendered = texToUnicode('a \\xrightarrow{n \\to \\infty} L')

    expect(rendered).toBe('a ─n → ∞→ L')
  })
})
|
||||
|
||||
describe('texToUnicode — punctuation commands without lookahead', () => {
|
||||
it('substitutes \\{ even when immediately followed by a letter', () => {
|
||||
// Regression: with a global `(?![A-Za-z])` lookahead, `\{p` refused
|
||||
// to substitute (because `p` is a letter) and rendered as `\{p`.
|
||||
expect(texToUnicode('\\{p, q\\}')).toBe('{p, q}')
|
||||
})
|
||||
|
||||
it('substitutes thin-space \\, before a letter', () => {
|
||||
expect(texToUnicode('a\\,b')).toBe('a b')
|
||||
})
|
||||
})
|
||||
|
||||
describe('texToUnicode — round-trip realism', () => {
  // Inputs that must come back exactly as they went in.
  const expectUntouched = (inputs: readonly string[]) => {
    for (const input of inputs) {
      expect(texToUnicode(input)).toBe(input)
    }
  }

  it('renders a typical model-emitted formula', () => {
    const tex = '\\alpha \\in \\mathbb{R}, \\alpha \\notin \\mathbb{Q}'

    expect(texToUnicode(tex)).toBe('α ∈ ℝ, α ∉ ℚ')
  })

  it('preserves unknown commands verbatim', () => {
    expectUntouched(['\\bigtriangleup \\circledast'])
  })

  it('handles commands without delimiters between', () => {
    // Thanks to the word-boundary lookahead, `\alpha\beta` is read as two
    // commands rather than one ungrouped `\alphabeta` token.
    expect(texToUnicode('\\alpha\\beta')).toBe('αβ')
  })

  it('leaves plain text alone', () => {
    expectUntouched(['hello world', ''])
  })
})
|
||||
|
|
@ -67,6 +67,48 @@ describe('findStableBoundary', () => {
|
|||
it('handles empty input', () => {
|
||||
expect(findStableBoundary('')).toBe(-1)
|
||||
})
|
||||
|
||||
it('refuses to split inside an open $$ math block', () => {
  // The `$$` block is opened and never closed, and the only blank line
  // lies inside it, so no safe boundary exists yet.
  expect(findStableBoundary('$$\nx + y\n\nmore math')).toBe(-1)
})

it('allows splitting after a $$ math block closes', () => {
  const settled = '$$\nx + y = z\n$$\n\n'
  const pending = 'narration continues'
  const text = settled + pending
  const cut = findStableBoundary(text)

  expect(text.slice(0, cut)).toBe(settled)
  expect(text.slice(cut)).toBe(pending)
})

it('splits before an open $$ block but not inside', () => {
  // Same shape as the fenced-code case: prose followed by an unclosed math
  // block. The blank line ahead of `$$` is the only safe place to cut.
  const settled = 'intro paragraph\n\n'
  const text = settled + '$$\nx + y\n\nmore'
  const cut = findStableBoundary(text)

  expect(text.slice(0, cut)).toBe(settled)
  expect(text.slice(cut).startsWith('$$')).toBe(true)
})

it('treats single-line $$x$$ as zero net toggle', () => {
  // `$$x = y$$` opens and closes on the same line, so a boundary after it
  // is permitted.
  const settled = 'intro\n\n$$x = y$$\n\n'
  const pending = 'narration'
  const text = settled + pending
  const cut = findStableBoundary(text)

  expect(text.slice(0, cut)).toBe(settled)
  expect(text.slice(cut)).toBe(pending)
})

it('refuses to split inside an open \\[ math block', () => {
  expect(findStableBoundary('\\[\nx + y\n\nmore')).toBe(-1)
})
|
||||
})
|
||||
|
||||
describe('streaming theme assumption', () => {
|
||||
|
|
|
|||
|
|
@ -2,9 +2,60 @@ import { Box, Link, Text } from '@hermes/ink'
|
|||
import { Fragment, memo, type ReactNode, useMemo } from 'react'
|
||||
|
||||
import { ensureEmojiPresentation } from '../lib/emoji.js'
|
||||
import { BOX_CLOSE, BOX_OPEN, texToUnicode } from '../lib/mathUnicode.js'
|
||||
import { highlightLine, isHighlightable } from '../lib/syntax.js'
|
||||
import type { Theme } from '../theme.js'
|
||||
|
||||
/**
 * Convert `texToUnicode` output into renderable children, highlighting any
 * `\boxed{X}` regions.
 *
 * Boxed regions are marked in the input with the non-printable U+0001 /
 * U+0002 sentinels (`BOX_OPEN` / `BOX_CLOSE`). The text is split on them and
 * each boxed segment is rendered with `inverse + bold`, so it reads as a
 * highlighter-pen emphasis on top of whatever color the parent `<Text>` is
 * using (the theme accent for math). A leading / trailing space inside the
 * highlight gives a one-cell visual margin so the highlight reads as a
 * block, not a hug.
 *
 * @param text Math text that may contain `BOX_OPEN` / `BOX_CLOSE` sentinels.
 * @returns The input string unchanged when it has no `BOX_OPEN`; otherwise
 *   an array mixing plain string segments and highlighted `<Text>` nodes.
 */
const renderMath = (text: string): ReactNode => {
  // Fast path: nothing boxed, so the string can be rendered as-is.
  if (!text.includes(BOX_OPEN)) {
    return text
  }

  const out: ReactNode[] = []
  let i = 0
  let key = 0

  while (i < text.length) {
    const start = text.indexOf(BOX_OPEN, i)

    if (start < 0) {
      // No further boxes: the remainder is plain text.
      out.push(text.slice(i))

      break
    }

    if (start > i) {
      out.push(text.slice(i, start))
    }

    // Skip by the sentinel's own length rather than a hard-coded 1.
    const bodyStart = start + BOX_OPEN.length
    const end = text.indexOf(BOX_CLOSE, bodyStart)

    if (end < 0) {
      // Unterminated box: emit the remainder as plain text, but drop the
      // opener so the raw control character never reaches the terminal.
      if (bodyStart < text.length) {
        out.push(text.slice(bodyStart))
      }

      break
    }

    out.push(
      <Text bold inverse key={key++}>
        {' '}
        {text.slice(bodyStart, end)}{' '}
      </Text>
    )

    i = end + BOX_CLOSE.length
  }

  return out
}
|
||||
|
||||
const FENCE_RE = /^\s*(`{3,}|~{3,})(.*)$/
|
||||
const FENCE_CLOSE_RE = /^\s*(`{3,}|~{3,})\s*$/
|
||||
const HR_RE = /^ {0,3}([-*_])(?:\s*\1){2,}\s*$/
|
||||
|
|
@ -19,6 +70,15 @@ const QUOTE_RE = /^\s*(?:>\s*)+/
|
|||
const TABLE_DIVIDER_CELL_RE = /^:?-{3,}:?$/
|
||||
const MD_URL_RE = '((?:[^\\s()]|\\([^\\s()]*\\))+?)'
|
||||
|
||||
// Display math openers: `$$ ... $$` (TeX) and `\[ ... \]` (LaTeX). The
|
||||
// opener is matched only when `$$` / `\[` appears at the very start of the
|
||||
// trimmed line — `startsWith('$$')` used to fire on prose like
|
||||
// `$$x+y$$ followed by more`, opening a block that never closed because the
|
||||
// trailing `$$` on the same line was invisible to the close-scan loop.
|
||||
const MATH_BLOCK_OPEN_RE = /^\s*(\$\$|\\\[)(.*)$/
|
||||
const MATH_BLOCK_CLOSE_DOLLAR_RE = /^(.*?)\$\$\s*$/
|
||||
const MATH_BLOCK_CLOSE_BRACKET_RE = /^(.*?)\\\]\s*$/
|
||||
|
||||
export const MEDIA_LINE_RE = /^\s*[`"']?MEDIA:\s*(\S+?)[`"']?\s*$/
|
||||
export const AUDIO_DIRECTIVE_RE = /^\s*\[\[audio_as_voice\]\]\s*$/
|
||||
|
||||
|
|
@ -31,6 +91,13 @@ export const AUDIO_DIRECTIVE_RE = /^\s*\[\[audio_as_voice\]\]\s*$/
|
|||
// `thing ~! more ~?` from Kimi / Qwen / GLM (kaomoji-style decorators)
|
||||
// doesn't pair up the first `~` with the next one on the line and swallow
|
||||
// the text between them as a dim `_`-prefixed span.
|
||||
//
|
||||
// Inline math (`$x$` and `\(x\)`) takes precedence over emphasis at the
|
||||
// same start position because regex alternation is leftmost-first; a
|
||||
// dollar-delimited span at column N wins over a `*` at column N+1, so
|
||||
// `$P=a*b*c$` renders as math instead of having `*b*` corrupted into
|
||||
// italics. Single-character minimums and "no space adjacent to delimiter"
|
||||
// rules keep currency prose like `$5 to $10` from being swallowed.
|
||||
export const INLINE_RE = new RegExp(
|
||||
[
|
||||
`!\\[(.*?)\\]\\(${MD_URL_RE}\\)`, // 1,2 image
|
||||
|
|
@ -46,7 +113,13 @@ export const INLINE_RE = new RegExp(
|
|||
`\\[\\^([^\\]]+)\\]`, // 13 footnote ref
|
||||
`\\^([^^\\s][^^]*?)\\^`, // 14 superscript
|
||||
`~([A-Za-z0-9]{1,8})~`, // 15 subscript
|
||||
`https?:\\/\\/[^\\s<]+` // 16 bare URL
|
||||
`(https?:\\/\\/[^\\s<]+)`, // 16 bare URL — wrapped so it owns its own
|
||||
// capture group; without this, the math
|
||||
// spans below would land in m[16] and the
|
||||
// MdInline dispatcher would treat them as
|
||||
// bare URLs and render them as autolinks.
|
||||
`(?<!\\$)\\$([^\\s$](?:[^$\\n]*?[^\\s$])?)\\$(?!\\$)`, // 17 inline math $...$
|
||||
`\\\\\\(([^\\n]+?)\\\\\\)` // 18 inline math \(...\)
|
||||
].join('|'),
|
||||
'g'
|
||||
)
|
||||
|
|
@ -93,12 +166,14 @@ export const stripInlineMarkup = (v: string) =>
|
|||
.replace(/\[\^([^\]]+)\]/g, '[$1]')
|
||||
.replace(/\^([^^\s][^^]*?)\^/g, '^$1')
|
||||
.replace(/~([A-Za-z0-9]{1,8})~/g, '_$1')
|
||||
.replace(/(?<!\$)\$([^\s$](?:[^$\n]*?[^\s$])?)\$(?!\$)/g, '$1')
|
||||
.replace(/\\\(([^\n]+?)\\\)/g, '$1')
|
||||
|
||||
const renderTable = (k: number, rows: string[][], t: Theme) => {
|
||||
const widths = rows[0]!.map((_, ci) => Math.max(...rows.map(r => stripInlineMarkup(r[ci] ?? '').length)))
|
||||
|
||||
// Thin divider under the header. Without it tables look like prose
|
||||
// with extra spacing because the header is just amber-coloured text
|
||||
// with extra spacing because the header is just accent-coloured text
|
||||
// (#15534). We avoid full borders on purpose — column widths come
|
||||
// from `stripInlineMarkup(...).length` (UTF-16 code units, not
|
||||
// display width), so a real outline often misaligns on emoji and
|
||||
|
|
@ -163,31 +238,39 @@ function MdInline({ t, text }: { t: Theme; text: string }) {
|
|||
} else if (m[6]) {
|
||||
parts.push(
|
||||
<Text key={parts.length} strikethrough>
|
||||
{m[6]}
|
||||
<MdInline t={t} text={m[6]} />
|
||||
</Text>
|
||||
)
|
||||
} else if (m[7]) {
|
||||
// Code is the one wrap that does NOT recurse — inline `code` spans
|
||||
// are verbatim by definition. Letting MdInline reprocess them
|
||||
// would corrupt regex examples and shell snippets.
|
||||
parts.push(
|
||||
<Text color={t.color.accent} dimColor key={parts.length}>
|
||||
{m[7]}
|
||||
</Text>
|
||||
)
|
||||
} else if (m[8] ?? m[9]) {
|
||||
// Recurse into bold / italic / strike / highlight so nested
|
||||
// `$...$` math (and other inline tokens) inside a `**bolded
|
||||
// statement with $\mathbb{Z}$ math**` actually render. Without
|
||||
// this the inner content is dropped into a single `<Text bold>`
|
||||
// verbatim and the math renderer never sees it.
|
||||
parts.push(
|
||||
<Text bold key={parts.length}>
|
||||
{m[8] ?? m[9]}
|
||||
<MdInline t={t} text={m[8] ?? m[9]!} />
|
||||
</Text>
|
||||
)
|
||||
} else if (m[10] ?? m[11]) {
|
||||
parts.push(
|
||||
<Text italic key={parts.length}>
|
||||
{m[10] ?? m[11]}
|
||||
<MdInline t={t} text={m[10] ?? m[11]!} />
|
||||
</Text>
|
||||
)
|
||||
} else if (m[12]) {
|
||||
parts.push(
|
||||
<Text backgroundColor={t.color.diffAdded} color={t.color.diffAddedWord} key={parts.length}>
|
||||
{m[12]}
|
||||
<MdInline t={t} text={m[12]} />
|
||||
</Text>
|
||||
)
|
||||
} else if (m[13]) {
|
||||
|
|
@ -218,6 +301,19 @@ function MdInline({ t, text }: { t: Theme; text: string }) {
|
|||
if (url.length < m[16].length) {
|
||||
parts.push(<Text key={parts.length}>{m[16].slice(url.length)}</Text>)
|
||||
}
|
||||
} else if (m[17] ?? m[18]) {
|
||||
// Inline math is run through `texToUnicode` (Greek letters, ℕℤℚℝ,
|
||||
// operators, sub/superscripts, fractions) and rendered in italic
|
||||
// accent. Italic is the disambiguator — links use accent+underline,
|
||||
// so without italic readers can't tell `\mathbb{R}` (math) from a
|
||||
// hyperlinked word. Anything `texToUnicode` doesn't recognise is
|
||||
// preserved verbatim, so unfamiliar commands just look like their
|
||||
// raw LaTeX rather than vanishing.
|
||||
parts.push(
|
||||
<Text color={t.color.accent} italic key={parts.length}>
|
||||
{renderMath(texToUnicode(m[17] ?? m[18]!))}
|
||||
</Text>
|
||||
)
|
||||
}
|
||||
|
||||
last = i + m[0].length
|
||||
|
|
@ -415,32 +511,80 @@ function MdImpl({ compact, t, text }: MdProps) {
|
|||
continue
|
||||
}
|
||||
|
||||
if (line.trim().startsWith('$$')) {
|
||||
start('code')
|
||||
const mathOpen = line.match(MATH_BLOCK_OPEN_RE)
|
||||
|
||||
if (mathOpen) {
|
||||
const opener = mathOpen[1]!
|
||||
const closeRe = opener === '$$' ? MATH_BLOCK_CLOSE_DOLLAR_RE : MATH_BLOCK_CLOSE_BRACKET_RE
|
||||
const headRest = mathOpen[2] ?? ''
|
||||
const block: string[] = []
|
||||
|
||||
for (i++; i < lines.length; i++) {
|
||||
if (lines[i]!.trim().startsWith('$$')) {
|
||||
// Single-line block: `$$x + y = z$$` or `\[x\]`. Capture inner content
|
||||
// and emit the block immediately. Without this, the close-scan loop
|
||||
// skips line `i` and treats the next opener as our closer, swallowing
|
||||
// every paragraph in between.
|
||||
const sameLineClose = headRest.match(closeRe)
|
||||
|
||||
if (sameLineClose) {
|
||||
const inner = sameLineClose[1]!.trim()
|
||||
|
||||
start('code')
|
||||
nodes.push(
|
||||
<Box flexDirection="column" key={key} paddingLeft={2}>
|
||||
{inner ? <Text color={t.color.accent}>{renderMath(texToUnicode(inner))}</Text> : null}
|
||||
</Box>
|
||||
)
|
||||
i++
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
// Multi-line block: scan ahead for a real closer before committing.
|
||||
// If none exists in the rest of the document, render this line as a
|
||||
// paragraph instead of consuming everything that follows.
|
||||
let closeIdx = -1
|
||||
|
||||
for (let j = i + 1; j < lines.length; j++) {
|
||||
if (closeRe.test(lines[j]!)) {
|
||||
closeIdx = j
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
block.push(lines[i]!)
|
||||
}
|
||||
|
||||
if (closeIdx < 0) {
|
||||
start('paragraph')
|
||||
nodes.push(<MdInline key={key} t={t} text={line} />)
|
||||
i++
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
if (headRest.trim()) {
|
||||
block.push(headRest)
|
||||
}
|
||||
|
||||
for (let j = i + 1; j < closeIdx; j++) {
|
||||
block.push(lines[j]!)
|
||||
}
|
||||
|
||||
const tail = lines[closeIdx]!.match(closeRe)![1]!.trimEnd()
|
||||
|
||||
if (tail.trim()) {
|
||||
block.push(tail)
|
||||
}
|
||||
|
||||
start('code')
|
||||
nodes.push(
|
||||
<Box flexDirection="column" key={key} paddingLeft={2}>
|
||||
<Text color={t.color.muted}>─ math</Text>
|
||||
|
||||
{block.map((l, j) => (
|
||||
<Text color={t.color.accent} key={j}>
|
||||
{l}
|
||||
{renderMath(texToUnicode(l))}
|
||||
</Text>
|
||||
))}
|
||||
</Box>
|
||||
)
|
||||
i = closeIdx + 1
|
||||
|
||||
continue
|
||||
}
|
||||
|
|
@ -451,7 +595,7 @@ function MdImpl({ compact, t, text }: MdProps) {
|
|||
start('heading')
|
||||
nodes.push(
|
||||
<Text bold color={t.color.accent} key={key}>
|
||||
{heading}
|
||||
<MdInline t={t} text={heading} />
|
||||
</Text>
|
||||
)
|
||||
i++
|
||||
|
|
@ -463,7 +607,7 @@ function MdImpl({ compact, t, text }: MdProps) {
|
|||
start('heading')
|
||||
nodes.push(
|
||||
<Text bold color={t.color.accent} key={key}>
|
||||
{line.trim()}
|
||||
<MdInline t={t} text={line.trim()} />
|
||||
</Text>
|
||||
)
|
||||
i += 2
|
||||
|
|
|
|||
|
|
@ -35,19 +35,60 @@ import type { Theme } from '../theme.js'
|
|||
|
||||
import { Md } from './markdown.js'
|
||||
|
||||
// Count ``` or ~~~ fence toggles in `s` up to `end`. Odd = currently inside
|
||||
// a fenced block; we can't split the prefix there or we'd orphan the fence.
|
||||
// Count ``` / ~~~ AND `$$` / `\[…\]` fence toggles in `s` up to `end`. Odd
|
||||
// = currently inside a fenced block; splitting the prefix there would
|
||||
// orphan the fence and let the unstable suffix re-render as broken
|
||||
// markdown. Math fences only toggle when the code fence is closed so
|
||||
// snippets like ` ```\n$$x$$\n``` ` (math example inside a code block)
|
||||
// don't double-count. A `$$x$$` line that opens AND closes on its own
|
||||
// produces zero net toggles; that's `len >= 4` plus `endsDollar`.
|
||||
//
|
||||
// NB: this is INTENTIONALLY more conservative than `markdown.tsx`'s
|
||||
// parser, which falls back to paragraph rendering when an `$$` opener
|
||||
// has no matching closer. The renderer can do that safely because it
|
||||
// always sees the full text on every call. The streaming chunker
|
||||
// cannot — once a chunk is committed to the monotonic stable prefix it
|
||||
// is frozen, so prematurely deciding "this `$$` is just prose" would
|
||||
// permanently commit a paragraph rendering that becomes wrong the
|
||||
// instant the closer streams in. Treating any unmatched `$$` opener
|
||||
// as still-open keeps the boundary parked behind it until the closer
|
||||
// arrives (or the stream ends and the non-streaming `<Md>` takes over,
|
||||
// at which point the renderer's fallback kicks in correctly).
|
||||
const fenceOpenAt = (s: string, end: number) => {
|
||||
let open = false
|
||||
let codeOpen = false
|
||||
let mathOpen = false
|
||||
let mathOpener: '$$' | '\\[' | null = null
|
||||
let i = 0
|
||||
|
||||
while (i < end) {
|
||||
const nl = s.indexOf('\n', i)
|
||||
const lineEnd = nl < 0 || nl > end ? end : nl
|
||||
const line = s.slice(i, lineEnd)
|
||||
const line = s.slice(i, lineEnd).trim()
|
||||
|
||||
if (/^\s*(?:`{3,}|~{3,})/.test(line)) {
|
||||
open = !open
|
||||
if (/^(?:`{3,}|~{3,})/.test(line)) {
|
||||
codeOpen = !codeOpen
|
||||
} else if (!codeOpen) {
|
||||
if (!mathOpen && /^\$\$/.test(line)) {
|
||||
const isSingleLine = line.length >= 4 && /\$\$$/.test(line)
|
||||
|
||||
if (!isSingleLine) {
|
||||
mathOpen = true
|
||||
mathOpener = '$$'
|
||||
}
|
||||
} else if (!mathOpen && /^\\\[/.test(line)) {
|
||||
const isSingleLine = /\\\]$/.test(line)
|
||||
|
||||
if (!isSingleLine) {
|
||||
mathOpen = true
|
||||
mathOpener = '\\['
|
||||
}
|
||||
} else if (mathOpen && mathOpener === '$$' && /\$\$$/.test(line)) {
|
||||
mathOpen = false
|
||||
mathOpener = null
|
||||
} else if (mathOpen && mathOpener === '\\[' && /\\\]$/.test(line)) {
|
||||
mathOpen = false
|
||||
mathOpener = null
|
||||
}
|
||||
}
|
||||
|
||||
if (nl < 0 || nl >= end) {
|
||||
|
|
@ -57,7 +98,7 @@ const fenceOpenAt = (s: string, end: number) => {
|
|||
i = nl + 1
|
||||
}
|
||||
|
||||
return open
|
||||
return codeOpen || mathOpen
|
||||
}
|
||||
|
||||
// Find the last "\n\n" boundary before `end` that is OUTSIDE a fenced code
|
||||
|
|
|
|||
770
ui-tui/src/lib/mathUnicode.ts
Normal file
770
ui-tui/src/lib/mathUnicode.ts
Normal file
|
|
@ -0,0 +1,770 @@
|
|||
// Best-effort LaTeX → Unicode for inline / display math captured by the
|
||||
// markdown renderer. The terminal can't typeset LaTeX, but Unicode covers
|
||||
// most of what models actually emit: Greek letters, blackboard / fraktur /
|
||||
// calligraphic capitals, set theory + logic operators, common arrows,
|
||||
// sub/superscripts, and `\frac{a}{b}` collapsed to `a/b`.
|
||||
//
|
||||
// Design rules:
|
||||
// • Pure regex pipeline. Anything we don't recognise is preserved
|
||||
// verbatim (so a `\foo{bar}` we've never heard of still survives).
|
||||
// A real LaTeX parser would be more correct but throws on partial
|
||||
// input — terminal users would rather see the raw command than a
|
||||
// parse-error placeholder.
|
||||
// • Longest-match-first ordering on commands so `\le` doesn't shadow
|
||||
// `\leq`, `\sub` doesn't shadow `\subseteq`, etc.
|
||||
// • Word-boundary lookahead `(?![A-Za-z])` after each command so
|
||||
// `\pix` (made-up command) doesn't get partially substituted as `π`.
|
||||
// • `\mathbb{X}`, `\mathcal{X}`, `\mathfrak{X}` only handle a single
|
||||
// letter argument — multi-letter `\mathbb{NN}` is rare and would
|
||||
// need a real parser to do correctly.
|
||||
// • Sub/super scripts only convert if EVERY character has a Unicode
|
||||
// equivalent. Mixed content like `^{n+1}` falls back to the raw
|
||||
// LaTeX so we don't emit `ⁿ+¹` (which has no `+` superscript glyph
|
||||
// in some fonts and reads worse than the source).
|
||||
|
||||
const SYMBOLS: Record<string, string> = {
|
||||
// Greek lowercase
|
||||
'\\alpha': 'α',
|
||||
'\\beta': 'β',
|
||||
'\\gamma': 'γ',
|
||||
'\\delta': 'δ',
|
||||
'\\epsilon': 'ε',
|
||||
'\\varepsilon': 'ε',
|
||||
'\\zeta': 'ζ',
|
||||
'\\eta': 'η',
|
||||
'\\theta': 'θ',
|
||||
'\\vartheta': 'ϑ',
|
||||
'\\iota': 'ι',
|
||||
'\\kappa': 'κ',
|
||||
'\\lambda': 'λ',
|
||||
'\\mu': 'μ',
|
||||
'\\nu': 'ν',
|
||||
'\\xi': 'ξ',
|
||||
'\\pi': 'π',
|
||||
'\\varpi': 'ϖ',
|
||||
'\\rho': 'ρ',
|
||||
'\\varrho': 'ϱ',
|
||||
'\\sigma': 'σ',
|
||||
'\\varsigma': 'ς',
|
||||
'\\tau': 'τ',
|
||||
'\\upsilon': 'υ',
|
||||
'\\phi': 'φ',
|
||||
'\\varphi': 'φ',
|
||||
'\\chi': 'χ',
|
||||
'\\psi': 'ψ',
|
||||
'\\omega': 'ω',
|
||||
|
||||
// Greek uppercase
|
||||
'\\Gamma': 'Γ',
|
||||
'\\Delta': 'Δ',
|
||||
'\\Theta': 'Θ',
|
||||
'\\Lambda': 'Λ',
|
||||
'\\Xi': 'Ξ',
|
||||
'\\Pi': 'Π',
|
||||
'\\Sigma': 'Σ',
|
||||
'\\Upsilon': 'Υ',
|
||||
'\\Phi': 'Φ',
|
||||
'\\Psi': 'Ψ',
|
||||
'\\Omega': 'Ω',
|
||||
|
||||
// Big operators
|
||||
'\\sum': '∑',
|
||||
'\\prod': '∏',
|
||||
'\\coprod': '∐',
|
||||
'\\int': '∫',
|
||||
'\\iint': '∬',
|
||||
'\\iiint': '∭',
|
||||
'\\oint': '∮',
|
||||
'\\bigcup': '⋃',
|
||||
'\\bigcap': '⋂',
|
||||
'\\bigvee': '⋁',
|
||||
'\\bigwedge': '⋀',
|
||||
'\\bigoplus': '⨁',
|
||||
'\\bigotimes': '⨂',
|
||||
|
||||
// Calculus
|
||||
'\\partial': '∂',
|
||||
'\\nabla': '∇',
|
||||
'\\sqrt': '√',
|
||||
|
||||
// Sets
|
||||
'\\emptyset': '∅',
|
||||
'\\varnothing': '∅',
|
||||
'\\infty': '∞',
|
||||
'\\in': '∈',
|
||||
'\\notin': '∉',
|
||||
'\\ni': '∋',
|
||||
'\\subset': '⊂',
|
||||
'\\supset': '⊃',
|
||||
'\\subseteq': '⊆',
|
||||
'\\supseteq': '⊇',
|
||||
'\\subsetneq': '⊊',
|
||||
'\\supsetneq': '⊋',
|
||||
'\\cup': '∪',
|
||||
'\\cap': '∩',
|
||||
'\\setminus': '∖',
|
||||
'\\complement': '∁',
|
||||
|
||||
// Logic
|
||||
'\\forall': '∀',
|
||||
'\\exists': '∃',
|
||||
'\\nexists': '∄',
|
||||
'\\land': '∧',
|
||||
'\\lor': '∨',
|
||||
'\\lnot': '¬',
|
||||
'\\neg': '¬',
|
||||
'\\therefore': '∴',
|
||||
'\\because': '∵',
|
||||
|
||||
// Relations
|
||||
'\\le': '≤',
|
||||
'\\leq': '≤',
|
||||
'\\ge': '≥',
|
||||
'\\geq': '≥',
|
||||
'\\ne': '≠',
|
||||
'\\neq': '≠',
|
||||
'\\ll': '≪',
|
||||
'\\gg': '≫',
|
||||
'\\approx': '≈',
|
||||
'\\equiv': '≡',
|
||||
'\\cong': '≅',
|
||||
'\\sim': '∼',
|
||||
'\\simeq': '≃',
|
||||
'\\propto': '∝',
|
||||
'\\perp': '⊥',
|
||||
'\\parallel': '∥',
|
||||
'\\models': '⊨',
|
||||
'\\vdash': '⊢',
|
||||
'\\mid': '∣',
|
||||
'\\nmid': '∤',
|
||||
'\\divides': '∣',
|
||||
|
||||
// Common standalone glyphs
|
||||
'\\blacksquare': '■',
|
||||
'\\square': '□',
|
||||
'\\Box': '□',
|
||||
'\\qed': '∎',
|
||||
'\\bigstar': '★',
|
||||
|
||||
// Modular arithmetic — the `\pmod{p}` form (with arg) is handled below;
|
||||
// the bare `\bmod` / `\mod` commands are simple text substitutions.
|
||||
'\\bmod': 'mod',
|
||||
'\\mod': 'mod',
|
||||
|
||||
// Brackets / fences (named delimiter commands; the `\left\X` / `\right\X`
|
||||
// unwrapping below leaves these behind for the symbol pass to resolve).
|
||||
'\\langle': '⟨',
|
||||
'\\rangle': '⟩',
|
||||
'\\lceil': '⌈',
|
||||
'\\rceil': '⌉',
|
||||
'\\lfloor': '⌊',
|
||||
'\\rfloor': '⌋',
|
||||
'\\|': '‖',
|
||||
|
||||
// Arrows
|
||||
'\\to': '→',
|
||||
'\\rightarrow': '→',
|
||||
'\\leftarrow': '←',
|
||||
'\\leftrightarrow': '↔',
|
||||
'\\Rightarrow': '⇒',
|
||||
'\\Leftarrow': '⇐',
|
||||
'\\Leftrightarrow': '⇔',
|
||||
'\\implies': '⟹',
|
||||
'\\impliedby': '⟸',
|
||||
'\\iff': '⟺',
|
||||
'\\mapsto': '↦',
|
||||
'\\hookrightarrow': '↪',
|
||||
'\\hookleftarrow': '↩',
|
||||
'\\uparrow': '↑',
|
||||
'\\downarrow': '↓',
|
||||
'\\updownarrow': '↕',
|
||||
|
||||
// Binary operators
|
||||
'\\cdot': '⋅',
|
||||
'\\cdots': '⋯',
|
||||
'\\ldots': '…',
|
||||
'\\dots': '…',
|
||||
'\\dotsb': '…',
|
||||
'\\dotsc': '…',
|
||||
'\\vdots': '⋮',
|
||||
'\\ddots': '⋱',
|
||||
'\\times': '×',
|
||||
'\\div': '÷',
|
||||
'\\pm': '±',
|
||||
'\\mp': '∓',
|
||||
'\\circ': '∘',
|
||||
'\\bullet': '•',
|
||||
'\\star': '⋆',
|
||||
'\\ast': '∗',
|
||||
'\\oplus': '⊕',
|
||||
'\\ominus': '⊖',
|
||||
'\\otimes': '⊗',
|
||||
'\\odot': '⊙',
|
||||
'\\diamond': '⋄',
|
||||
'\\angle': '∠',
|
||||
'\\triangle': '△',
|
||||
|
||||
// Spacing — collapse to varying widths of regular space
|
||||
'\\,': ' ',
|
||||
'\\;': ' ',
|
||||
'\\:': ' ',
|
||||
'\\!': '',
|
||||
'\\ ': ' ',
|
||||
'\\quad': ' ',
|
||||
'\\qquad': ' ',
|
||||
|
||||
// Functions (LaTeX renders these in roman; we just keep the name)
|
||||
'\\sin': 'sin',
|
||||
'\\cos': 'cos',
|
||||
'\\tan': 'tan',
|
||||
'\\cot': 'cot',
|
||||
'\\sec': 'sec',
|
||||
'\\csc': 'csc',
|
||||
'\\arcsin': 'arcsin',
|
||||
'\\arccos': 'arccos',
|
||||
'\\arctan': 'arctan',
|
||||
'\\sinh': 'sinh',
|
||||
'\\cosh': 'cosh',
|
||||
'\\tanh': 'tanh',
|
||||
'\\log': 'log',
|
||||
'\\ln': 'ln',
|
||||
'\\exp': 'exp',
|
||||
'\\det': 'det',
|
||||
'\\dim': 'dim',
|
||||
'\\ker': 'ker',
|
||||
'\\lim': 'lim',
|
||||
'\\liminf': 'liminf',
|
||||
'\\limsup': 'limsup',
|
||||
'\\sup': 'sup',
|
||||
'\\inf': 'inf',
|
||||
'\\max': 'max',
|
||||
'\\min': 'min',
|
||||
'\\arg': 'arg',
|
||||
'\\gcd': 'gcd',
|
||||
|
||||
// Escaped literals — model occasionally emits these for display
|
||||
'\\&': '&',
|
||||
'\\%': '%',
|
||||
'\\$': '$',
|
||||
'\\#': '#',
|
||||
'\\_': '_',
|
||||
'\\{': '{',
|
||||
'\\}': '}'
|
||||
}
|
||||
|
||||
// Blackboard-bold capitals for `\mathbb{X}`, keyed by the plain letter.
// The glyph string is ordered A..Z; `Array.from` walks it by code point so
// the astral-plane glyphs (𝔸, 𝔹, …) stay intact.
const BB: Record<string, string> = Object.fromEntries(
  Array.from('𝔸𝔹ℂ𝔻𝔼𝔽𝔾ℍ𝕀𝕁𝕂𝕃𝕄ℕ𝕆ℙℚℝ𝕊𝕋𝕌𝕍𝕎𝕏𝕐ℤ', (glyph, i): [string, string] => [
    String.fromCharCode(0x41 + i),
    glyph
  ])
)
|
||||
|
||||
// Calligraphic (script) capitals for `\mathcal{X}`, keyed by the plain
// letter. The glyph string is ordered A..Z and iterated by code point.
const CAL: Record<string, string> = Object.fromEntries(
  Array.from('𝒜ℬ𝒞𝒟ℰℱ𝒢ℋℐ𝒥𝒦ℒℳ𝒩𝒪𝒫𝒬ℛ𝒮𝒯𝒰𝒱𝒲𝒳𝒴𝒵', (glyph, i): [string, string] => [
    String.fromCharCode(0x41 + i),
    glyph
  ])
)
|
||||
|
||||
// Fraktur capitals for `\mathfrak{X}`, keyed by the plain letter. The
// glyph string is ordered A..Z and iterated by code point.
const FRAK: Record<string, string> = Object.fromEntries(
  Array.from('𝔄𝔅ℭ𝔇𝔈𝔉𝔊ℌℑ𝔍𝔎𝔏𝔐𝔑𝔒𝔓𝔔ℜ𝔖𝔗𝔘𝔙𝔚𝔛𝔜ℨ', (glyph, i): [string, string] => [
    String.fromCharCode(0x41 + i),
    glyph
  ])
)
|
||||
|
||||
// Plain character → Unicode superscript glyph. The two strings are
// position-aligned; `q` is absent from both because this table carries no
// superscript glyph for it.
const SUPERSCRIPT: Record<string, string> = (() => {
  const plain = '0123456789+-=()abcdefghijklmnoprstuvwxyz'
  const raised = Array.from('⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ᵃᵇᶜᵈᵉᶠᵍʰⁱʲᵏˡᵐⁿᵒᵖʳˢᵗᵘᵛʷˣʸᶻ')

  return Object.fromEntries(Array.from(plain, (ch, i): [string, string] => [ch, raised[i]!]))
})()
|
||||
|
||||
// Plain character → Unicode subscript glyph. The two strings are
// position-aligned; only the letters listed here have an entry, so any
// other letter makes a subscript body fall back to its raw form.
const SUBSCRIPT: Record<string, string> = (() => {
  const plain = '0123456789+-=()aehijklmnoprstuvx'
  const lowered = Array.from('₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎ₐₑₕᵢⱼₖₗₘₙₒₚᵣₛₜᵤᵥₓ')

  return Object.fromEntries(Array.from(plain, (ch, i): [string, string] => [ch, lowered[i]!]))
})()
|
||||
|
||||
// `\boxed` / `\fbox` regions are delimited in the converted output by a
// pair of control characters: U+0001 opens a region, U+0002 closes it.
// `BOX_RE` matches one open…close pair whose body contains neither
// sentinel and captures that body, so a consumer can either style the
// region or strip the markers.
export const BOX_OPEN = '\u0001'

export const BOX_CLOSE = '\u0002'

export const BOX_RE = new RegExp(`${BOX_OPEN}([^${BOX_OPEN}${BOX_CLOSE}]*)${BOX_CLOSE}`, 'g')
|
||||
|
||||
// Backslash-escape every regex metacharacter in `s` so the result can be
// embedded in a `RegExp` source and match `s` literally.
const escapeRe = (s: string): string => s.replace(/[\\^$.*+?()[\]{}|]/g, meta => `\\${meta}`)
|
||||
|
||||
// Pre-compile two symbol regexes: one for letter-ending commands (`\pi`,
|
||||
// `\sum`) which need a `(?![A-Za-z])` lookahead so they don't partially
|
||||
// match `\pix` or `\summa`, and one for punctuation-ending commands
|
||||
// (`\{`, `\,`, `\|`) which must NOT have the lookahead — otherwise
|
||||
// `\{p` would refuse to substitute because `p` is a letter.
|
||||
//
|
||||
// Longest commands first inside each group so `\leq` beats `\le`.
|
||||
// Partition command names by their final character: `letter` holds the
// ones ending in an ASCII letter, `punct` holds everything else. Relative
// order within each group follows the input order.
const splitByEnding = (keys: string[]) => {
  const endsWithLetter = (cmd: string) => /[A-Za-z]$/.test(cmd)

  return {
    letter: keys.filter(endsWithLetter),
    punct: keys.filter(cmd => !endsWithLetter(cmd))
  }
}
|
||||
|
||||
// Build a regex alternation body (`a|b|c`) from literal command names.
// Names are ordered longest-first so a longer command is tried before any
// shorter command that is its prefix (`\leq` before `\le`), and each name
// is regex-escaped. Sorting happens on a copy: `Array.prototype.sort`
// works in place, and reordering the caller's array is an unexpected side
// effect for a function that only returns a string.
const buildAlt = (cmds: readonly string[]) =>
  [...cmds]
    .sort((a, b) => b.length - a.length)
    .map(cmd => escapeRe(cmd))
    .join('|')
|
||||
|
||||
const { letter: LETTER_CMDS, punct: PUNCT_CMDS } = splitByEnding(Object.keys(SYMBOLS))

// Letter-ending commands carry a trailing `(?![A-Za-z])` so a known
// command is never matched as the prefix of a longer command name.
const SYMBOL_LETTER_RE = new RegExp(`(?:${buildAlt(LETTER_CMDS)})(?![A-Za-z])`, 'g')

// Punctuation-ending commands match unconditionally, whatever follows.
const SYMBOL_PUNCT_RE = new RegExp(`(?:${buildAlt(PUNCT_CMDS)})`, 'g')
|
||||
|
||||
// Convert a sub/superscript body using `table`.
//
// When every character of `input` has an entry in `table`, the mapped
// glyphs are returned joined together (no sigil). Otherwise nothing is
// converted and a readable fallback is returned: the trimmed body
// prefixed by `sigil`, bare when it is a single code point (`^∞`) and
// wrapped in parentheses when it is longer (`^(n+1)`).
const convertScript = (input: string, table: Record<string, string>, sigil: '^' | '_'): string => {
  const glyphs = Array.from(input, ch => table[ch])

  if (glyphs.every(glyph => glyph)) {
    return glyphs.join('')
  }

  const body = input.trim()
  const isSingleCodePoint = Array.from(body).length === 1

  return isSingleCodePoint ? `${sigil}${body}` : `${sigil}(${body})`
}
|
||||
|
||||
// Read one balanced `{...}` group that begins exactly at `s[start]`.
//
// Nested braces are tracked with a depth counter, so a body such as
// `|t|^{p-1}` is read whole. A backslash skips the character after it,
// which keeps `\{` and `\}` from affecting the depth.
//
// Returns the text between the outer braces plus the offset just past
// the closing brace, or null when `s[start]` is not `{` or the group is
// never closed.
const readBraced = (s: string, start: number): { content: string; end: number } | null => {
  if (s[start] !== '{') {
    return null
  }

  let depth = 1

  for (let pos = start + 1; pos < s.length; pos++) {
    const ch = s[pos]

    if (ch === '\\' && pos + 1 < s.length) {
      // Step over the escaped character; the loop increment moves past it.
      pos++

      continue
    }

    if (ch === '{') {
      depth++
    } else if (ch === '}') {
      depth--

      if (depth === 0) {
        return { content: s.slice(start + 1, pos), end: pos + 1 }
      }
    }
  }

  return null
}
|
||||
|
||||
// Replace every `command{arg}` in `input` with `render(arg)`.
//
// The argument is read with balanced-brace parsing (`readBraced`), so
// `\boxed{x^{n+1}}` works where a `[^{}]*` regex would stop early. The
// argument is itself scanned for the same command before `render` sees
// it, so nested uses (`\boxed{\boxed{x}}`) are rendered inner-first.
//
// Left untouched:
//   • `command` immediately followed by a letter — that is a different,
//     longer command name.
//   • `command` with no balanced `{...}` after optional spaces/tabs.
//
// After a failed argument parse the scan resumes AT the first
// non-blank character rather than one past it, so a second occurrence
// that starts right there (`\boxed \boxed{x}`) is still converted.
// An empty `command` returns `input` unchanged; it could never advance
// the scan.
const replaceBracedCommand = (input: string, command: string, render: (content: string) => string): string => {
  if (!command) {
    return input
  }

  const cmdLen = command.length
  let out = ''
  let i = 0

  while (i < input.length) {
    const idx = input.indexOf(command, i)

    if (idx < 0) {
      break
    }

    const afterCmd = idx + cmdLen
    const next = input[afterCmd]

    if (next && /[A-Za-z]/.test(next)) {
      // Prefix of a longer command name: copy through and keep scanning.
      out += input.slice(i, afterCmd)
      i = afterCmd

      continue
    }

    let p = afterCmd

    while (input[p] === ' ' || input[p] === '\t') {
      p++
    }

    const arg = readBraced(input, p)

    if (!arg) {
      // No argument: keep the command text verbatim. `p > idx`, so the
      // scan always advances.
      out += input.slice(i, p)
      i = p

      continue
    }

    out += input.slice(i, idx) + render(replaceBracedCommand(arg.content, command, render))
    i = arg.end
  }

  return out + input.slice(i)
}
|
||||
|
||||
// Replace every `\frac{num}{den}` with `num/den`, parenthesising either
// side via `wrapForFrac`. The sizing variants `\dfrac`, `\tfrac` and
// `\cfrac` take the same two arguments and are collapsed the same way;
// without that they would pass through as raw LaTeX.
//
// Both arguments are read with balanced-brace parsing and recursed into
// before wrapping, so `\frac{1}{\frac{1}{x}}` becomes `1/(1/x)`.
//
// Left untouched:
//   • a command followed by a letter (`\fraction`) — a different command.
//   • a command missing either braced argument (`\frac12`, `\frac{a}b`).
//     The text is kept verbatim and scanning resumes at the first
//     character that failed to parse, so a fraction that starts right
//     there is still converted.
const replaceFracs = (input: string): string => {
  // Created per call: the `g` flag makes `lastIndex` stateful, and this
  // function recurses, so the pattern must not be shared across calls.
  const fracCommand = /\\[cdt]?frac(?![A-Za-z])/g

  // Index of the first character at or after `from` that is not a space/tab.
  const skipBlanks = (from: number): number => {
    let p = from

    while (input[p] === ' ' || input[p] === '\t') {
      p++
    }

    return p
  }

  let out = ''
  let cursor = 0

  while (cursor < input.length) {
    fracCommand.lastIndex = cursor

    const hit = fracCommand.exec(input)

    if (!hit) {
      break
    }

    const numStart = skipBlanks(hit.index + hit[0].length)
    const num = readBraced(input, numStart)

    if (!num) {
      // `numStart` is past the command text, so the scan always advances.
      out += input.slice(cursor, numStart)
      cursor = numStart

      continue
    }

    const denStart = skipBlanks(num.end)
    const den = readBraced(input, denStart)

    if (!den) {
      out += input.slice(cursor, denStart)
      cursor = denStart

      continue
    }

    out += input.slice(cursor, hit.index)
    out += `${wrapForFrac(replaceFracs(num.content))}/${wrapForFrac(replaceFracs(den.content))}`
    cursor = den.end
  }

  return out + input.slice(cursor)
}
|
||||
|
||||
// Parenthesise a numerator/denominator when writing it next to an inline
// `/` would change its meaning, so `\frac{a+b}{c}` becomes `(a+b)/c`
// rather than `a+b/c`.
//
//   • Empty (after trimming)            → returned as the empty string.
//   • Already enclosed by ONE matching  → returned as-is; the opening
//     pair of parentheses                 paren at index 0 must close at
//                                         the final character.
//   • Contains `+`, `-`, `*`, `/` or    → wrapped in parentheses.
//     whitespace
//   • Anything else (`n!`, `x^2`)       → returned as-is.
//
// The enclosure test walks the parentheses instead of using
// `/^\(.*\)$/`: that regex also accepts `(a)+(b)`, which merely starts
// and ends with parens, and would emit `(a)+(b)/c` for
// `\frac{(a)+(b)}{c}`.
const wrapForFrac = (expr: string) => {
  const trimmed = expr.trim()

  if (!trimmed) {
    return trimmed
  }

  // True when the `(` at index 0 is closed by the very last character.
  const enclosedByOnePair = (): boolean => {
    if (!trimmed.startsWith('(') || !trimmed.endsWith(')')) {
      return false
    }

    let depth = 0

    for (let i = 0; i < trimmed.length; i++) {
      const ch = trimmed[i]

      if (ch === '(') {
        depth++
      } else if (ch === ')') {
        depth--

        // The first group closed before the end: the outer parens belong
        // to different groups.
        if (depth === 0 && i < trimmed.length - 1) {
          return false
        }
      }
    }

    return depth === 0
  }

  if (enclosedByOnePair()) {
    return trimmed
  }

  if (/[+\-/*]|\s/.test(trimmed)) {
    return `(${trimmed})`
  }

  return trimmed
}
|
||||
|
||||
/**
 * Convert a LaTeX math fragment into a best-effort Unicode approximation.
 *
 * The input goes through a fixed sequence of string rewrites; the order
 * matters and is explained step by step below. Constructs that no step
 * recognises are left exactly as written.
 *
 * `\boxed{…}` / `\fbox{…}` bodies are returned wrapped in
 * `BOX_OPEN` / `BOX_CLOSE` sentinel characters.
 *
 * @param input Raw LaTeX, without the surrounding `$` / `$$` delimiters.
 * @returns The converted string.
 */
export function texToUnicode(input: string): string {
  // Stand-in for an escaped `\_` while the script passes run. Replacing
  // `\_` with a plain `_` straight away would let the subscript pass
  // read it as a subscript sigil (`user\_id` → `userᵢd`). Restored to `_`
  // just before returning. U+0003 is assumed not to occur in real input.
  const ESCAPED_UNDERSCORE = '\u0003'

  let s = input

  // Alphabet commands: single-letter argument only; a letter with no
  // table entry keeps the raw command.
  s = s.replace(/\\mathbb\s*\{([A-Za-z])\}/g, (raw, c: string) => BB[c] ?? raw)
  s = s.replace(/\\mathcal\s*\{([A-Za-z])\}/g, (raw, c: string) => CAL[c] ?? raw)
  s = s.replace(/\\mathfrak\s*\{([A-Za-z])\}/g, (raw, c: string) => FRAK[c] ?? raw)

  // Font / text wrappers: drop the wrapper, keep the (brace-free) body.
  s = s.replace(/\\mathbf\s*\{([^{}]+)\}/g, (_, c: string) => c)
  s = s.replace(/\\mathit\s*\{([^{}]+)\}/g, (_, c: string) => c)
  s = s.replace(/\\mathrm\s*\{([^{}]+)\}/g, (_, c: string) => c)
  s = s.replace(/\\text\s*\{([^{}]+)\}/g, (_, c: string) => c)
  s = s.replace(/\\operatorname\s*\{([^{}]+)\}/g, (_, c: string) => c)

  // Accents: append the matching combining mark after the body.
  s = s.replace(/\\overline\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0305`)
  s = s.replace(/\\hat\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0302`)
  s = s.replace(/\\bar\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0304`)
  s = s.replace(/\\tilde\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0303`)
  s = s.replace(/\\vec\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u20D7`)
  s = s.replace(/\\dot\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0307`)
  s = s.replace(/\\ddot\s*\{([^{}]+)\}/g, (_, c: string) => `${c}\u0308`)

  s = replaceFracs(s)

  // `\boxed{X}` / `\fbox{X}`: wrap the trimmed body in the BOX_OPEN /
  // BOX_CLOSE sentinels so this function stays pure-string and the
  // caller decides how to emphasise the region. The argument is parsed
  // with balanced braces, so nested `{...}` inside the box survive.
  s = replaceBracedCommand(s, '\\boxed', body => `${BOX_OPEN}${body.trim()}${BOX_CLOSE}`)
  s = replaceBracedCommand(s, '\\fbox', body => `${BOX_OPEN}${body.trim()}${BOX_CLOSE}`)

  // `\xrightarrow{label}` / `\xleftarrow{label}` collapse to an arrow
  // with the label inline. This runs before the symbol pass so the label
  // still picks up symbol substitutions afterwards.
  s = s.replace(/\\xrightarrow\s*\{([^{}]*)\}/g, (_, label: string) => `─${label.trim()}→`)
  s = s.replace(/\\xleftarrow\s*\{([^{}]*)\}/g, (_, label: string) => `←${label.trim()}─`)

  // Long arrows. The `(?![A-Za-z])` lookahead matches the rule applied to
  // every other letter-ending command: never rewrite the prefix of a
  // longer command name.
  s = s.replace(/\\Longrightarrow(?![A-Za-z])/g, '⟹')
  s = s.replace(/\\Longleftarrow(?![A-Za-z])/g, '⟸')
  s = s.replace(/\\Longleftrightarrow(?![A-Za-z])/g, '⟺')

  // `\pmod{p}` → ` (mod p)`; `\pod{p}` and `\tag{n}` → ` (p)` / ` (n)`.
  // The leading `\s*` absorbs whitespace already in the source so
  // `b \pmod{p}` does not come out as `b  (mod p)` (double space).
  s = s.replace(/\s*\\pmod\s*\{([^{}]*)\}/g, (_, p: string) => ` (mod ${p.trim()})`)
  s = s.replace(/\s*\\pod\s*\{([^{}]*)\}/g, (_, p: string) => ` (${p.trim()})`)
  s = s.replace(/\s*\\tag\s*\{([^{}]*)\}/g, (_, n: string) => ` (${n.trim()})`)

  // `\big`, `\Big`, `\bigg`, `\Bigg` (optional `l` / `r` / `m` suffix) are
  // sizing wrappers: strip them and leave whatever delimiter follows. The
  // lookahead keeps longer commands such as `\bigtriangleup` intact.
  s = s.replace(/\\(?:Bigg|bigg|Big|big)[lrm]?(?![A-Za-z])/g, '')

  // Style / size hints produce no glyph; drop them together with any
  // trailing whitespace.
  s = s.replace(/\\(?:scriptscriptstyle|displaystyle|scriptstyle|textstyle|nolimits|limits)(?![A-Za-z])\s*/g, '')

  // `\left` / `\right` wrap any delimiter. Strip the wrapper and let the
  // symbol pass handle what follows; the optional `.` consumes `\left.` /
  // `\right.` ("no delimiter"). The lookahead keeps `\leftarrow`,
  // `\leftrightarrow`, `\rightarrow` safe.
  s = s.replace(/\\left(?![A-Za-z])\.?/g, '')
  s = s.replace(/\\right(?![A-Za-z])\.?/g, '')

  // Symbols run BEFORE scripts so `^{\infty}` is `^{∞}` by the time
  // `convertScript` sees it. Punctuation-ending commands go first: they
  // may legitimately be followed by a letter (`\{p`), which the letter
  // pass's lookahead would reject. `\_` is parked as a placeholder here
  // (see ESCAPED_UNDERSCORE above).
  s = s.replace(SYMBOL_PUNCT_RE, m => (m === '\\_' ? ESCAPED_UNDERSCORE : (SYMBOLS[m] ?? m)))
  s = s.replace(SYMBOL_LETTER_RE, m => SYMBOLS[m] ?? m)

  // Scripts: braced bodies first, then a single bare character. The bare
  // form accepts only alphanumerics and `+` / `-` / `=`; parentheses are
  // excluded because the braced fallback can emit `^(...)` and a second
  // pass must not turn its opening paren into `⁽` and orphan the closer.
  s = s.replace(/\^\s*\{([^{}]+)\}/g, (_, body: string) => convertScript(body, SUPERSCRIPT, '^'))
  s = s.replace(/\^([A-Za-z0-9+\-=])/g, (raw, ch: string) => SUPERSCRIPT[ch] ?? raw)
  s = s.replace(/_\s*\{([^{}]+)\}/g, (_, body: string) => convertScript(body, SUBSCRIPT, '_'))
  s = s.replace(/_([A-Za-z0-9+\-=])/g, (raw, ch: string) => SUBSCRIPT[ch] ?? raw)

  // Escaped underscores become literal `_` now that no script pass can
  // misread them.
  return s.split(ESCAPED_UNDERSCORE).join('_')
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue