fix(cli,tui): align CJK / wide-char markdown tables (#23863)

CJK glyphs and most emoji render as two terminal cells, but JS
String#length counts UTF-16 code units (one per BMP CJK glyph) and the
model's own padding assumes one cell per character, so any markdown
table with Chinese / Japanese / Korean cells drifts right per row when
a real terminal renders it. Both surfaces fix this with a display-cell
width measurement (wcswidth on the Python side, stringWidth on the
TUI side).

Changes:
- agent/markdown_tables.py: new helper. realign_markdown_tables(text)
  detects markdown table blocks (header + |---| divider) and
  rewrites the row padding using wcwidth.wcswidth so every pipe and
  dash lines up across rows. No-op on text without tables.
- cli.py: hook the helper into _render_final_assistant_content for
  strip / render modes (raw passes through untouched), and into the
  streaming line emitter so live token-by-token rendering also
  produces aligned tables. A small two-buffer state machine in
  _emit_stream_text holds table rows until the block ends, then
  flushes them through the realigner so all rows pad to a single
  per-column width.
- ui-tui/src/components/markdown.tsx: renderTable now uses
  stringWidth (Bun.stringWidth fast path + East-Asian-width-aware
  fallback, already memoised in @hermes/ink) instead of UTF-16
  String#length for both column-width measurement and per-cell
  padding. Drops the comment that documented the bug as a deliberate
  limitation.

Validation:
- New tests/agent/test_markdown_tables.py (11): every rebuilt block
  shares pipe column offsets across rows for pure CJK, mixed
  CJK+emoji, ragged-row, and multi-table inputs.
- Updated tests/cli/test_cli_markdown_rendering.py: the existing
  strip-mode test asserted exact whitespace; rewritten to assert the
  alignment contract (cell content survives + every rendered row
  shares pipe offsets).
- New ui-tui markdown.test.ts case (1): rendered column-2 start
  offset is identical for the header + every body row, including
  the CJK row that drifted before the fix.
- Live: hermes chat -q with the user-reported screenshot prompt now
  produces a perfectly aligned table on the wire (header, divider,
  4 body rows including '通义千问', all pipes at identical columns).
This commit is contained in:
Teknium 2026-05-11 11:13:06 -07:00 committed by GitHub
parent 657874460f
commit 1d00716754
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 559 additions and 15 deletions

View file

@ -217,3 +217,50 @@ describe('Md wrapping', () => {
expect(lines.some(line => line.startsWith(' hi ok'))).toBe(true)
})
})
describe('renderTable CJK width alignment', () => {
  it('column starts share the same display offset across CJK rows', async () => {
    const { stringWidth } = await import('@hermes/ink')

    // Three-column table whose first column mixes ASCII and CJK labels.
    const source = [
      '| 配置 | Config | 状态 |',
      '|------|--------|------|',
      '| Vicuna (report) | dense | × |',
      '| ChatGLM | chat | ✓ |',
      '| 通义千问 | qwen | × |'
    ].join('\n')

    // Regression being guarded: padding used to be computed as
    // `w - stripInlineMarkup(cell).length`, i.e. in UTF-16 code units.
    // A cell therefore rendered `w + (displayWidth - length)` cells
    // wide, pushing every later column right by one cell per wide
    // glyph in that row.
    //
    // Contract after the fix: whatever precedes column 2 occupies the
    // same number of display cells on the header and on every body
    // row. The divider line carries none of the anchors below, so it
    // never takes part in the comparison.
    const tree = React.createElement(
      Box,
      null,
      React.createElement(Md, { compact: true, t: DEFAULT_THEME, text: source })
    )
    const rendered = renderPlain(tree).filter(line => line.trim().length > 0)

    // Display-cell offset of `anchor` on the first rendered line that
    // contains it, or undefined when no line does. The anchors are the
    // column-2 values, each unique to one row; `indexOf` is
    // case-sensitive, so 'chat' does not match 'ChatGLM' in column 1.
    const col2Offset = (anchor: string): number | undefined => {
      for (const line of rendered) {
        const at = line.indexOf(anchor)

        if (at >= 0) {
          return stringWidth(line.slice(0, at))
        }
      }

      return undefined
    }

    const headerCol2 = col2Offset('Config')
    expect(headerCol2).toBeDefined()

    // 'qwen' sits on the CJK row ('通义千问'), the one that drifted
    // before the fix; it must line up with the others now.
    for (const anchor of ['dense', 'chat', 'qwen']) {
      expect(col2Offset(anchor)).toBe(headerCol2)
    }
  })
})

View file

@ -1,4 +1,4 @@
import { Box, Link, Text } from '@hermes/ink'
import { Box, Link, stringWidth, Text } from '@hermes/ink'
import { Fragment, memo, type ReactNode, useMemo } from 'react'
import { ensureEmojiPresentation } from '../lib/emoji.js'
@ -170,16 +170,22 @@ export const stripInlineMarkup = (v: string) =>
.replace(/\\\(([^\n]+?)\\\)/g, '$1')
const renderTable = (k: number, rows: string[][], t: Theme) => {
const widths = rows[0]!.map((_, ci) => Math.max(...rows.map(r => stripInlineMarkup(r[ci] ?? '').length)))
// Column widths in *display cells*, not UTF-16 code units. CJK
// glyphs and most emoji render as two cells, but `String#length`
// counts code units (one per BMP CJK glyph), so padding computed
// from it makes Chinese / Japanese / Korean tables drift across
// rows. `stringWidth` (Bun.stringWidth fast path + an
// East-Asian-width-aware fallback, memoised in @hermes/ink)
// returns the actual cell count.
const cellWidth = (raw: string) => stringWidth(stripInlineMarkup(raw))
const widths = rows[0]!.map((_, ci) => Math.max(...rows.map(r => cellWidth(r[ci] ?? ''))))
// Thin divider under the header. Without it tables look like prose
// with extra spacing because the header is just accent-coloured text
// (#15534). We avoid full borders on purpose — column widths come
// from `stripInlineMarkup(...).length` (UTF-16 code units, not
// display width), so a real outline often misaligns on emoji and
// East-Asian wide characters; one dim solid rule (`─`) under row 0
// plus tab-style column gaps reads cleanly on every terminal we
// tested.
// from `stringWidth(...)`, so the dividers and the row content stay
// in sync on CJK / emoji tables; tab-style column gaps still read
// cleanly without the boxed look.
const sep = widths.map(w => '─'.repeat(Math.max(1, w))).join(' ')
return (
@ -190,7 +196,7 @@ const renderTable = (k: number, rows: string[][], t: Theme) => {
{widths.map((w, ci) => (
<Text bold={ri === 0} color={ri === 0 ? t.color.accent : undefined} key={ci}>
<MdInline t={t} text={row[ci] ?? ''} />
{' '.repeat(Math.max(0, w - stripInlineMarkup(row[ci] ?? '').length))}
{' '.repeat(Math.max(0, w - cellWidth(row[ci] ?? '')))}
{ci < widths.length - 1 ? ' ' : ''}
</Text>
))}