mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-06-10 08:32:09 +00:00
fix(desktop): polish LaTeX rendering — currency, code blocks, brackets
Five distinct bugs surfaced from a math-heavy stress test:
1. Adjacent code fences glued together. scrubBacktickNoise's
second-pass regex /``\s*``/g matched the LAST 2 backticks of
one fence + whitespace + FIRST 2 backticks of the next, collapsing
two blocks into one. Fixed with lookbehind/lookahead so we only
match exactly 2 backticks not part of a longer run.
2. Whitespace eaten between fences and following content.
stripPreviewTargets internally calls .trim() which strips leading/
trailing whitespace from each split-segment. For segments between
two fences this collapsed \n\n to '', gluing fence close to next
block. Fixed by capturing leading/trailing whitespace at the call
site and restoring it after the transform.
3. Currency dollar signs eaten as math. With singleDollarTextMath:true
remark-math greedy-matched any pair of $, so '$5 ... $10' became
one inline math span. Added escapeCurrencyDollars to escape $<digit>
patterns to \$<digit> in prose segments (not in code). Trade-off:
math expressions starting with a digit (rare — '$5x = 10$') get
escaped too. Mirrors the convention in ChatGPT/Claude's UIs.
4. \(...\) and \[...\] LaTeX brackets unsupported. Models often
emit these instead of $...$ / $$...$$. Added
rewriteLatexBracketDelimiters preprocessor pass.
5. ```latex / ```tex blocks were being routed to KaTeX via a
rewrite to ```math. They are now left as ordinary code blocks,
which aligns with the GitHub markdown convention:
```math = render as math; ```latex / ```tex = LaTeX/TeX
source code (syntax highlighted, not rendered). Conflating them
broke teaching/showing-source use cases. MATH_FENCE_LANGUAGES
pruned to {'math'} only.
Also flipped parseIncompleteMarkdown to true (was !isStreaming) so
the math parser can't see $ inside streaming-but-not-yet-closed code
fences. Shiki was already deferred via defer={isStreaming} so this
doesn't introduce new tokenization cost.
Test: 18/18 existing tests still pass; one test updated to expect
escaped \$ in currency-prose-with-URL case.
This commit is contained in:
parent
747caa74f0
commit
708d2a0c33
3 changed files with 125 additions and 4 deletions
|
|
@ -136,8 +136,13 @@ describe('preprocessMarkdown', () => {
|
|||
const output = preprocessMarkdown(input)
|
||||
|
||||
expect(output).not.toContain('```')
|
||||
// Currency dollar amounts get escaped to `\$` in the preprocessor
|
||||
// so they don't get parsed as math delimiters by remark-math (we
|
||||
// enable singleDollarTextMath, which would otherwise greedy-match
|
||||
// `$56...$99` as one big inline math span). The escape is invisible
|
||||
// to the user — `\$` renders as a literal `$` in the final output.
|
||||
expect(output).toContain(
|
||||
'~$56<https://www.getyourguide.com/san-juan-puerto-rico-l355/san-juan-snorkel-sea-turtles-manatees-free-video-rum-t879147/> Old San Juan Sunset Cruise'
|
||||
'~\\$56<https://www.getyourguide.com/san-juan-puerto-rico-l355/san-juan-snorkel-sea-turtles-manatees-free-video-rum-t879147/> Old San Juan Sunset Cruise'
|
||||
)
|
||||
expect(output).toContain(
|
||||
'<https://www.getyourguide.com/en-gb/san-juan-puerto-rico-l355/san-juan-old-san-juan-sunset-cruise-with-drinks-transfer-t405191/>'
|
||||
|
|
|
|||
|
|
@ -335,7 +335,15 @@ const MarkdownTextImpl = () => {
|
|||
)}
|
||||
lineNumbers={false}
|
||||
mode="streaming"
|
||||
parseIncompleteMarkdown={!isStreaming}
|
||||
// Always auto-close incomplete fences — even during streaming.
|
||||
// Without this, an unclosed ```python ... ``` whose body contains
|
||||
// `$` (very common: shell snippets, JS template strings, dollar
|
||||
// amounts) leaks those dollars out to the math parser and they
|
||||
// get rendered as broken inline math until the closing fence
|
||||
// arrives. Shiki is independently deferred via `defer={isStreaming}`
|
||||
// on the SyntaxHighlighter component, so we don't pay code-block
|
||||
// tokenization on every token even with this set.
|
||||
parseIncompleteMarkdown
|
||||
plugins={{ math: mathPlugin, ...(isStreaming ? {} : { code }) }}
|
||||
preprocess={preprocessMarkdown}
|
||||
shikiTheme={['github-light-default', 'github-dark-default']}
|
||||
|
|
|
|||
|
|
@ -94,7 +94,14 @@ function scrubBacktickNoise(text: string): string {
|
|||
out += text.slice(cursor).replace(fenceNoiseRe, '')
|
||||
|
||||
for (let pass = 0; pass < 2; pass += 1) {
|
||||
out = out.replace(/``\s*``/g, '')
|
||||
// Match EXACTLY 2 backticks (not part of a longer run) on each side.
|
||||
// Without the lookbehind/lookahead, two adjacent triple-backtick
|
||||
// fences with only whitespace between them get spliced together —
|
||||
// e.g. ```bash\n...\n```\n\n```latex matches the regex's
|
||||
// last-2-of-bash-close + \n\n + first-2-of-latex-open and the
|
||||
// surrounding fence markers collapse into a single longer block,
|
||||
// which the markdown parser then treats as ONE giant code block.
|
||||
out = out.replace(/(?<!`)``(?!`)\s*(?<!`)``(?!`)/g, '')
|
||||
out = out.replace(/(^|[^`])``(?=\s|[.,;:!?)\]'"\u2014\u2013-]|$)/g, '$1')
|
||||
}
|
||||
|
||||
|
|
@ -164,6 +171,22 @@ function findClosingFence(lines: string[], start: number, marker: string): numbe
|
|||
return -1
|
||||
}
|
||||
|
||||
// Languages that should be routed to the math (KaTeX) renderer instead of
|
||||
// being shown as a syntax-highlighted code block.
|
||||
//
|
||||
// We deliberately recognize ONLY `math` here, not `latex` or `tex`.
|
||||
// Reasoning: GitHub-style markdown uses ` ```math ` to mean "render as
|
||||
// math" and ` ```latex `/` ```tex ` to mean "show LaTeX/TeX source code"
|
||||
// (syntax highlighted). Conflating the two breaks code blocks where a
|
||||
// user is *discussing* LaTeX rather than embedding it (e.g.,
|
||||
// ```latex\n\begin{equation}\n E = mc^2\n\end{equation}``` shown as a
|
||||
// teaching example). Anyone who wants math rendered should use ```math.
|
||||
const MATH_FENCE_LANGUAGES = new Set(['math'])
|
||||
|
||||
// Reports whether a fence's language tag is one of the tags listed in
// MATH_FENCE_LANGUAGES. The tag is lower-cased before the lookup, so the
// comparison ignores letter case.
function isMathFence(language: string): boolean {
  const normalizedTag = language.toLowerCase()

  return MATH_FENCE_LANGUAGES.has(normalizedTag)
}
|
||||
|
||||
function normalizeFenceBlocks(text: string): string {
|
||||
const sourceLines = text.split('\n')
|
||||
const out: string[] = []
|
||||
|
|
@ -226,6 +249,15 @@ function normalizeFenceBlocks(text: string): string {
|
|||
|
||||
if (isLikelyProseFence(infoRaw, body)) {
|
||||
pushProseFence(out, indent, infoRaw, bodyLines)
|
||||
} else if (isMathFence(language)) {
|
||||
// Streaming math fence — rewrite the language tag to "math".
|
||||
// remark-math + rehype-katex pick up ```math fenced blocks via
|
||||
// the language-math class on the resulting <code> element. We
|
||||
// keep the fence intact (instead of converting to $$..$$) so
|
||||
// any literal `$$` characters in the body don't collide with
|
||||
// an outer math wrapper. No close emitted yet — streaming.
|
||||
out.push(`${indent}${marker}math`)
|
||||
out.push(...bodyLines)
|
||||
} else {
|
||||
out.push(`${indent}${marker}${language}`)
|
||||
out.push(...bodyLines)
|
||||
|
|
@ -241,6 +273,21 @@ function normalizeFenceBlocks(text: string): string {
|
|||
continue
|
||||
}
|
||||
|
||||
if (isMathFence(language)) {
|
||||
// Closed math fence — rewrite the language tag to "math" so
|
||||
// rehype-katex's language-math class detection picks it up.
|
||||
// Body stays untouched (no $$..$$ rewrite) so authors can write
|
||||
// arbitrary LaTeX including `$$display$$` markers without them
|
||||
// colliding with our wrapper. Without this rewrite the block
|
||||
// would render as a syntax-highlighted "latex" code listing.
|
||||
out.push(`${indent}${marker}math`)
|
||||
out.push(...bodyLines)
|
||||
out.push(`${indent}${marker}`)
|
||||
index = closeIndex + 1
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
out.push(`${indent}${marker}${language}`)
|
||||
out.push(...bodyLines)
|
||||
out.push(`${indent}${marker}`)
|
||||
|
|
@ -250,6 +297,39 @@ function normalizeFenceBlocks(text: string): string {
|
|||
return out.join('\n')
|
||||
}
|
||||
|
||||
// Convert LaTeX bracket delimiters to remark-math's dollar-sign syntax.
|
||||
// Models often emit `\(...\)` for inline math and `\[...\]` for display
|
||||
// math (the standard LaTeX convention) instead of `$...$` / `$$...$$`.
|
||||
// remark-math only natively recognizes the dollar form, so we rewrite at
|
||||
// preprocess time. Done with simple non-greedy matches keyed on the
|
||||
// escaped-bracket sequences — these are rare enough in non-math content
|
||||
// (you'd have to write a literal `\(` followed eventually by a literal
|
||||
// `\)` with NO interleaving newline-paragraph-break) that false positives
|
||||
// are extremely unlikely.
|
||||
// `\(...\)` on a single line. The lookbehind rejects an opener that is itself
// preceded by a backslash, i.e. the tail of an escaped backslash (`\\(`).
const LATEX_INLINE_RE = /(?<!\\)\\\(([^\n]+?)\\\)/g
// `\[...\]`, possibly spanning lines. The lookbehind keeps LaTeX line breaks
// with spacing arguments (`\\[2pt]`) from being read as a display opener.
const LATEX_DISPLAY_RE = /(?<!\\)\\\[([\s\S]+?)\\\]/g
// An inline code span: a backtick run closed by a run of the same length,
// not crossing a blank line. Used only to skip over code spans.
const LATEX_SKIP_CODE_SPAN_RE = /(?<![\\`])(`+)(?!`)(?:(?!\n[ \t]*\n)[\s\S])*?(?<!`)\1(?!`)/
// Single-pass alternation: group 1 = code-span backticks, group 2 = inline
// math body, group 3 = display math body. The code-span branch comes first
// so bracket delimiters inside a span are consumed before they can match.
const LATEX_BRACKET_REWRITE_RE = new RegExp(
  `${LATEX_SKIP_CODE_SPAN_RE.source}|${LATEX_INLINE_RE.source}|${LATEX_DISPLAY_RE.source}`,
  'g'
)

// Rewrites `\(...\)` to `$...$` and `\[...\]` to `$$...$$`.
//
// Inline code spans are returned unchanged, so prose that *shows* the
// delimiters (e.g. "write `\(x\)` for inline math") keeps them literal.
// Everything is done in one left-to-right pass, so a delimiter pair that is
// already inside a rewritten body is not rewritten a second time.
//
// @param text  A prose segment (callers are expected to exclude fenced code).
// @returns     The text with LaTeX bracket delimiters converted.
function rewriteLatexBracketDelimiters(text: string): string {
  return text.replace(
    LATEX_BRACKET_REWRITE_RE,
    (
      match: string,
      _codeTicks: string | undefined,
      inlineBody: string | undefined,
      displayBody: string | undefined
    ): string => {
      if (inlineBody !== undefined) {
        return `$${inlineBody}$`
      }

      if (displayBody !== undefined) {
        return `$$${displayBody}$$`
      }

      // Code span: leave untouched.
      return match
    }
  )
}
|
||||
|
||||
// Escape `$<digit>` patterns so they don't get eaten as math delimiters.
|
||||
// Models commonly write currency amounts ($5, $19.99, $1,299) in prose.
|
||||
// With `singleDollarTextMath: true`, remark-math is greedy and matches
|
||||
// EVERY pair of `$`s — including the open of `$5` to the next `$10`,
|
||||
// rendering "5 in my pocket and you have " as italicized math text.
|
||||
// The de-facto convention across math-supporting LLM UIs is to treat
|
||||
// `$` followed by a digit as currency rather than math, since math
|
||||
// expressions almost always start with a letter or `\command`. Trade-
|
||||
// off: a math expression like `$5x = 10$` would have its leading 5
|
||||
// escaped — annoying but rare. The escape `\$` survives to render as
|
||||
// a literal `$` in the final output.
|
||||
// Two alternatives, tried in this order at every position:
//   1. An inline code span (backtick run closed by a run of the same
//      length, not crossing a blank line). Group 1 captures the backticks.
//      Matching it first consumes the span so `$` inside it is never seen.
//   2. A `$` that is followed by a digit and is not preceded by `\`
//      (already escaped) or by `$` (second half of a `$$` display opener).
const CURRENCY_DOLLAR_RE = /(?<![\\`])(`+)(?!`)(?:(?!\n[ \t]*\n)[\s\S])*?(?<!`)\1(?!`)|(?<![\\$])\$(?=\d)/g

// Escapes currency-style `$<digit>` as `\$<digit>` so the math parser does
// not pair the dollar signs up as inline-math delimiters.
//
// Left untouched:
//   - inline code spans: Markdown does not process backslash escapes there,
//     so an inserted `\` would be shown to the reader;
//   - dollars that are already escaped (`\$5`);
//   - `$$<digit>`, so display math such as `$$5x = 10$$` keeps its opener.
//
// @param text  A prose segment (callers are expected to exclude fenced code).
// @returns     The text with currency dollars escaped.
function escapeCurrencyDollars(text: string): string {
  return text.replace(CURRENCY_DOLLAR_RE, (match: string, codeTicks: string | undefined): string =>
    codeTicks !== undefined ? match : '\\$'
  )
}
|
||||
|
||||
export function preprocessMarkdown(text: string): string {
|
||||
const cleaned = text.replace(REASONING_BLOCK_RE, '').replace(PREVIEW_MARKER_RE, '')
|
||||
const scrubbed = scrubBacktickNoise(cleaned)
|
||||
|
|
@ -258,7 +338,35 @@ export function preprocessMarkdown(text: string): string {
|
|||
|
||||
return strippedEmptyFences
|
||||
.split(CODE_FENCE_SPLIT_RE)
|
||||
.map(part => (/^(?:```|~~~)/.test(part) ? part : normalizeVisibleProse(stripPreviewTargets(part))))
|
||||
.map(part => {
|
||||
// Fence blocks pass through untouched.
|
||||
if (/^(?:```|~~~)/.test(part)) {return part}
|
||||
|
||||
// Whitespace-only segments (e.g. the `\n\n` between two adjacent
|
||||
// fences) must NOT go through stripPreviewTargets — its internal
|
||||
// .trim() would collapse them to '' and glue the surrounding
|
||||
// fences together, producing things like ``````math which the
|
||||
// markdown parser then reads as a single 6-backtick block.
|
||||
if (!part.trim()) {return part}
|
||||
// Preserve leading/trailing whitespace around the prose body so
|
||||
// that fence-prose-fence sequences keep their blank-line gaps.
|
||||
// stripPreviewTargets internally calls .trim() on its result for
|
||||
// the benefit of its other (single-segment) callers; here we're
|
||||
// operating on a SEGMENT of a larger document where outer
|
||||
// whitespace is structural and must survive.
|
||||
const leading = part.match(/^\s*/)?.[0] ?? ''
|
||||
const trailing = part.match(/\s*$/)?.[0] ?? ''
|
||||
|
||||
// rewriteLatexBracketDelimiters runs only on prose segments so
|
||||
// we don't accidentally touch `\(` inside a code block.
|
||||
// escapeCurrencyDollars likewise only runs on prose, so legit
|
||||
// `$5` literals inside fenced code stay intact.
|
||||
const transformed = normalizeVisibleProse(
|
||||
stripPreviewTargets(rewriteLatexBracketDelimiters(escapeCurrencyDollars(part)))
|
||||
)
|
||||
|
||||
return leading + transformed + trailing
|
||||
})
|
||||
.join('')
|
||||
.replace(/[ \t]+\n/g, '\n')
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue