Merge pull request #17175 from NousResearch/fix/markdown

feat(latex): latex in tui
This commit is contained in:
Austin Pickett 2026-04-29 10:18:17 -07:00 committed by GitHub
commit 430302c197
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 1391 additions and 50 deletions

View file

@ -2,9 +2,60 @@ import { Box, Link, Text } from '@hermes/ink'
import { Fragment, memo, type ReactNode, useMemo } from 'react'
import { ensureEmojiPresentation } from '../lib/emoji.js'
import { BOX_CLOSE, BOX_OPEN, texToUnicode } from '../lib/mathUnicode.js'
import { highlightLine, isHighlightable } from '../lib/syntax.js'
import type { Theme } from '../theme.js'
// `\boxed{X}` regions in `texToUnicode` output are marked with the
// non-printable U+0001 / U+0002 sentinels. Split on them and render the
// boxed segment with `inverse + bold` so it reads as a highlighter-pen
// emphasis on top of whatever color the parent `<Text>` is using (the
// theme accent for math). The leading / trailing space inside the
// highlight gives a one-cell visual margin so the highlight reads as a
// block, not a hug.
const renderMath = (text: string): ReactNode => {
  // Fast path: no boxed region at all, so the plain string is already
  // a valid React child.
  if (text.indexOf(BOX_OPEN) === -1) {
    return text
  }

  const pieces: ReactNode[] = []
  let boxes = 0

  for (let cursor = 0; cursor < text.length; ) {
    const openAt = text.indexOf(BOX_OPEN, cursor)

    // No further opener: everything that is left is plain text.
    if (openAt === -1) {
      pieces.push(text.slice(cursor))
      break
    }

    // Plain text sitting between the cursor and the opener.
    if (openAt !== cursor) {
      pieces.push(text.slice(cursor, openAt))
    }

    const closeAt = text.indexOf(BOX_CLOSE, openAt + 1)

    // Opener without a closer: emit the remainder untouched (opener
    // sentinel included) and stop scanning.
    if (closeAt === -1) {
      pieces.push(text.slice(openAt))
      break
    }

    // Boxed segment: bold + inverse, padded with one space on each side
    // so the highlight has a one-cell margin around the content.
    const boxed = text.slice(openAt + 1, closeAt)

    pieces.push(
      <Text bold inverse key={boxes}>
        {' '}
        {boxed}{' '}
      </Text>
    )
    boxes += 1
    cursor = closeAt + 1
  }

  return pieces
}
// Fence line: optional leading whitespace, then a run of 3+ backticks or
// 3+ tildes (group 1), followed by whatever remains on the line (group 2).
const FENCE_RE = /^\s*(`{3,}|~{3,})(.*)$/
// Bare fence line: the same marker (group 1) with nothing but optional
// whitespace after it.
const FENCE_CLOSE_RE = /^\s*(`{3,}|~{3,})\s*$/
// Horizontal rule: up to three leading spaces, then the same `-`, `*` or
// `_` character repeated at least three times, optionally separated by
// whitespace, and nothing else on the line.
const HR_RE = /^ {0,3}([-*_])(?:\s*\1){2,}\s*$/
@ -19,6 +70,15 @@ const QUOTE_RE = /^\s*(?:>\s*)+/
// A single table-divider cell: three or more dashes with an optional `:`
// on either side.
const TABLE_DIVIDER_CELL_RE = /^:?-{3,}:?$/
// Regex *source* fragment (a string, not a RegExp) that is interpolated
// into INLINE_RE. It captures a lazily matched run made of characters that
// are neither whitespace nor parentheses, or of single-level `(...)`
// groups containing no whitespace or nested parentheses.
const MD_URL_RE = '((?:[^\\s()]|\\([^\\s()]*\\))+?)'
// Display math openers: `$$ ... $$` (TeX) and `\[ ... \]` (LaTeX). The
// opener is matched only when `$$` / `\[` appears at the very start of the
// trimmed line — `startsWith('$$')` used to fire on prose like
// `$$x+y$$ followed by more`, opening a block that never closed because the
// trailing `$$` on the same line was invisible to the close-scan loop.
// Group 1 is the opener (`$$` or `\[`); group 2 is everything that follows
// it on the same line.
const MATH_BLOCK_OPEN_RE = /^\s*(\$\$|\\\[)(.*)$/
// Line that ends in `$$` (trailing whitespace allowed); group 1 is the
// lazily matched text in front of the closer.
const MATH_BLOCK_CLOSE_DOLLAR_RE = /^(.*?)\$\$\s*$/
// Line that ends in `\]` (trailing whitespace allowed); group 1 is the
// lazily matched text in front of the closer.
const MATH_BLOCK_CLOSE_BRACKET_RE = /^(.*?)\\\]\s*$/
// Whole-line `MEDIA:` directive: optional leading whitespace and an optional
// opening backtick / quote, the literal `MEDIA:`, then a whitespace-free
// token (group 1, matched lazily so an optional trailing backtick / quote
// is left out of the capture).
export const MEDIA_LINE_RE = /^\s*[`"']?MEDIA:\s*(\S+?)[`"']?\s*$/
// Line consisting solely of the literal `[[audio_as_voice]]`, with optional
// surrounding whitespace.
export const AUDIO_DIRECTIVE_RE = /^\s*\[\[audio_as_voice\]\]\s*$/
@ -31,6 +91,13 @@ export const AUDIO_DIRECTIVE_RE = /^\s*\[\[audio_as_voice\]\]\s*$/
// `thing ~! more ~?` from Kimi / Qwen / GLM (kaomoji-style decorators)
// doesn't pair up the first `~` with the next one on the line and swallow
// the text between them as a dim `_`-prefixed span.
//
// Inline math (`$x$` and `\(x\)`) takes precedence over emphasis at the
// same start position because regex alternation is leftmost-first; a
// dollar-delimited span at column N wins over a `*` at column N+1, so
// `$P=a*b*c$` renders as math instead of having `*b*` corrupted into
// italics. Single-character minimums and "no space adjacent to delimiter"
// rules keep currency prose like `$5 to $10` from being swallowed.
export const INLINE_RE = new RegExp(
[
`!\\[(.*?)\\]\\(${MD_URL_RE}\\)`, // 1,2 image
@ -46,7 +113,13 @@ export const INLINE_RE = new RegExp(
`\\[\\^([^\\]]+)\\]`, // 13 footnote ref
`\\^([^^\\s][^^]*?)\\^`, // 14 superscript
`~([A-Za-z0-9]{1,8})~`, // 15 subscript
`https?:\\/\\/[^\\s<]+` // 16 bare URL
`(https?:\\/\\/[^\\s<]+)`, // 16 bare URL — wrapped so it owns its own
// capture group; without this, the math
// spans below would land in m[16] and the
// MdInline dispatcher would treat them as
// bare URLs and render them as autolinks.
`(?<!\\$)\\$([^\\s$](?:[^$\\n]*?[^\\s$])?)\\$(?!\\$)`, // 17 inline math $...$
`\\\\\\(([^\\n]+?)\\\\\\)` // 18 inline math \(...\)
].join('|'),
'g'
)
@ -93,12 +166,14 @@ export const stripInlineMarkup = (v: string) =>
.replace(/\[\^([^\]]+)\]/g, '[$1]')
.replace(/\^([^^\s][^^]*?)\^/g, '^$1')
.replace(/~([A-Za-z0-9]{1,8})~/g, '_$1')
.replace(/(?<!\$)\$([^\s$](?:[^$\n]*?[^\s$])?)\$(?!\$)/g, '$1')
.replace(/\\\(([^\n]+?)\\\)/g, '$1')
const renderTable = (k: number, rows: string[][], t: Theme) => {
const widths = rows[0]!.map((_, ci) => Math.max(...rows.map(r => stripInlineMarkup(r[ci] ?? '').length)))
// Thin divider under the header. Without it tables look like prose
// with extra spacing because the header is just amber-coloured text
// with extra spacing because the header is just accent-coloured text
// (#15534). We avoid full borders on purpose — column widths come
// from `stripInlineMarkup(...).length` (UTF-16 code units, not
// display width), so a real outline often misaligns on emoji and
@ -163,31 +238,39 @@ function MdInline({ t, text }: { t: Theme; text: string }) {
} else if (m[6]) {
parts.push(
<Text key={parts.length} strikethrough>
{m[6]}
<MdInline t={t} text={m[6]} />
</Text>
)
} else if (m[7]) {
// Code is the one wrap that does NOT recurse — inline `code` spans
// are verbatim by definition. Letting MdInline reprocess them
// would corrupt regex examples and shell snippets.
parts.push(
<Text color={t.color.accent} dimColor key={parts.length}>
{m[7]}
</Text>
)
} else if (m[8] ?? m[9]) {
// Recurse into bold / italic / strike / highlight so nested
// `$...$` math (and other inline tokens) inside a `**bolded
// statement with $\mathbb{Z}$ math**` actually render. Without
// this the inner content is dropped into a single `<Text bold>`
// verbatim and the math renderer never sees it.
parts.push(
<Text bold key={parts.length}>
{m[8] ?? m[9]}
<MdInline t={t} text={m[8] ?? m[9]!} />
</Text>
)
} else if (m[10] ?? m[11]) {
parts.push(
<Text italic key={parts.length}>
{m[10] ?? m[11]}
<MdInline t={t} text={m[10] ?? m[11]!} />
</Text>
)
} else if (m[12]) {
parts.push(
<Text backgroundColor={t.color.diffAdded} color={t.color.diffAddedWord} key={parts.length}>
{m[12]}
<MdInline t={t} text={m[12]} />
</Text>
)
} else if (m[13]) {
@ -218,6 +301,19 @@ function MdInline({ t, text }: { t: Theme; text: string }) {
if (url.length < m[16].length) {
parts.push(<Text key={parts.length}>{m[16].slice(url.length)}</Text>)
}
} else if (m[17] ?? m[18]) {
// Inline math is run through `texToUnicode` (Greek letters,
// operators, sub/superscripts, fractions) and rendered in italic
// accent. Italic is the disambiguator — links use accent+underline,
// so without italic readers can't tell `\mathbb{R}` (math) from a
// hyperlinked word. Anything `texToUnicode` doesn't recognise is
// preserved verbatim, so unfamiliar commands just look like their
// raw LaTeX rather than vanishing.
parts.push(
<Text color={t.color.accent} italic key={parts.length}>
{renderMath(texToUnicode(m[17] ?? m[18]!))}
</Text>
)
}
last = i + m[0].length
@ -415,32 +511,80 @@ function MdImpl({ compact, t, text }: MdProps) {
continue
}
if (line.trim().startsWith('$$')) {
start('code')
const mathOpen = line.match(MATH_BLOCK_OPEN_RE)
if (mathOpen) {
const opener = mathOpen[1]!
const closeRe = opener === '$$' ? MATH_BLOCK_CLOSE_DOLLAR_RE : MATH_BLOCK_CLOSE_BRACKET_RE
const headRest = mathOpen[2] ?? ''
const block: string[] = []
for (i++; i < lines.length; i++) {
if (lines[i]!.trim().startsWith('$$')) {
i++
// Single-line block: `$$x + y = z$$` or `\[x\]`. Capture inner content
// and emit the block immediately. Without this, the close-scan loop
// skips line `i` and treats the next opener as our closer, swallowing
// every paragraph in between.
const sameLineClose = headRest.match(closeRe)
if (sameLineClose) {
const inner = sameLineClose[1]!.trim()
start('code')
nodes.push(
<Box flexDirection="column" key={key} paddingLeft={2}>
{inner ? <Text color={t.color.accent}>{renderMath(texToUnicode(inner))}</Text> : null}
</Box>
)
i++
continue
}
// Multi-line block: scan ahead for a real closer before committing.
// If none exists in the rest of the document, render this line as a
// paragraph instead of consuming everything that follows.
let closeIdx = -1
for (let j = i + 1; j < lines.length; j++) {
if (closeRe.test(lines[j]!)) {
closeIdx = j
break
}
block.push(lines[i]!)
}
if (closeIdx < 0) {
start('paragraph')
nodes.push(<MdInline key={key} t={t} text={line} />)
i++
continue
}
if (headRest.trim()) {
block.push(headRest)
}
for (let j = i + 1; j < closeIdx; j++) {
block.push(lines[j]!)
}
const tail = lines[closeIdx]!.match(closeRe)![1]!.trimEnd()
if (tail.trim()) {
block.push(tail)
}
start('code')
nodes.push(
<Box flexDirection="column" key={key} paddingLeft={2}>
<Text color={t.color.muted}> math</Text>
{block.map((l, j) => (
<Text color={t.color.accent} key={j}>
{l}
{renderMath(texToUnicode(l))}
</Text>
))}
</Box>
)
i = closeIdx + 1
continue
}
@ -451,7 +595,7 @@ function MdImpl({ compact, t, text }: MdProps) {
start('heading')
nodes.push(
<Text bold color={t.color.accent} key={key}>
{heading}
<MdInline t={t} text={heading} />
</Text>
)
i++
@ -463,7 +607,7 @@ function MdImpl({ compact, t, text }: MdProps) {
start('heading')
nodes.push(
<Text bold color={t.color.accent} key={key}>
{line.trim()}
<MdInline t={t} text={line.trim()} />
</Text>
)
i += 2

View file

@ -35,19 +35,60 @@ import type { Theme } from '../theme.js'
import { Md } from './markdown.js'
// Count ``` or ~~~ fence toggles in `s` up to `end`. Odd = currently inside
// a fenced block; we can't split the prefix there or we'd orphan the fence.
// Count ``` / ~~~ AND `$$` / `\[…\]` fence toggles in `s` up to `end`. Odd
// = currently inside a fenced block; splitting the prefix there would
// orphan the fence and let the unstable suffix re-render as broken
// markdown. Math fences only toggle when the code fence is closed so
// snippets like ` ```\n$$x$$\n``` ` (math example inside a code block)
// don't double-count. A `$$x$$` line that opens AND closes on its own
// produces zero net toggles; that's `len >= 4` plus `endsDollar`.
//
// NB: this is INTENTIONALLY more conservative than `markdown.tsx`'s
// parser, which falls back to paragraph rendering when an `$$` opener
// has no matching closer. The renderer can do that safely because it
// always sees the full text on every call. The streaming chunker
// cannot — once a chunk is committed to the monotonic stable prefix it
// is frozen, so prematurely deciding "this `$$` is just prose" would
// permanently commit a paragraph rendering that becomes wrong the
// instant the closer streams in. Treating any unmatched `$$` opener
// as still-open keeps the boundary parked behind it until the closer
// arrives (or the stream ends and the non-streaming `<Md>` takes over,
// at which point the renderer's fallback kicks in correctly).
const fenceOpenAt = (s: string, end: number) => {
let open = false
let codeOpen = false
let mathOpen = false
let mathOpener: '$$' | '\\[' | null = null
let i = 0
while (i < end) {
const nl = s.indexOf('\n', i)
const lineEnd = nl < 0 || nl > end ? end : nl
const line = s.slice(i, lineEnd)
const line = s.slice(i, lineEnd).trim()
if (/^\s*(?:`{3,}|~{3,})/.test(line)) {
open = !open
if (/^(?:`{3,}|~{3,})/.test(line)) {
codeOpen = !codeOpen
} else if (!codeOpen) {
if (!mathOpen && /^\$\$/.test(line)) {
const isSingleLine = line.length >= 4 && /\$\$$/.test(line)
if (!isSingleLine) {
mathOpen = true
mathOpener = '$$'
}
} else if (!mathOpen && /^\\\[/.test(line)) {
const isSingleLine = /\\\]$/.test(line)
if (!isSingleLine) {
mathOpen = true
mathOpener = '\\['
}
} else if (mathOpen && mathOpener === '$$' && /\$\$$/.test(line)) {
mathOpen = false
mathOpener = null
} else if (mathOpen && mathOpener === '\\[' && /\\\]$/.test(line)) {
mathOpen = false
mathOpener = null
}
}
if (nl < 0 || nl >= end) {
@ -57,7 +98,7 @@ const fenceOpenAt = (s: string, end: number) => {
i = nl + 1
}
return open
return codeOpen || mathOpen
}
// Find the last "\n\n" boundary before `end` that is OUTSIDE a fenced code