fix(cli,tui): align CJK / wide-char markdown tables (#23863)

CJK and emoji glyphs render as two terminal cells but JS String#length
and the model's own padding count them as one, so any markdown table
with Chinese / Japanese / Korean cells drifts right per row when a
real terminal renders it. Both surfaces fix this with a display-cell
width measurement (wcswidth on the Python side, stringWidth on the
TUI side).

Changes:
- agent/markdown_tables.py: new helper. realign_markdown_tables(text)
  detects markdown table blocks (header + |---| divider) and
  rewrites the row padding using wcwidth.wcswidth so every pipe and
  dash lines up across rows. No-op on text without tables.
- cli.py: hook the helper into _render_final_assistant_content for
  strip / render modes (raw passes through untouched), and into the
  streaming line emitter so live token-by-token rendering also
  produces aligned tables. A small two-buffer state machine in
  _emit_stream_text holds table rows until the block ends, then
  flushes them through the realigner so all rows pad to a single
  per-column width.
- ui-tui/src/components/markdown.tsx: renderTable now uses
  stringWidth (Bun.stringWidth fast path + East-Asian-width-aware
  fallback, already memoised in @hermes/ink) instead of UTF-16
  String#length for both column-width measurement and per-cell
  padding. Drops the comment that documented the bug as a deliberate
  limitation.

Validation:
- New tests/agent/test_markdown_tables.py (11): every rebuilt block
  shares pipe column offsets across rows for pure CJK, mixed
  CJK+emoji, ragged-row, and multi-table inputs.
- Updated tests/cli/test_cli_markdown_rendering.py: the existing
  strip-mode test asserted exact whitespace; rewritten to assert the
  alignment contract (cell content survives + every rendered row
  shares pipe offsets).
- New ui-tui markdown.test.ts case (1): rendered column-2 start
  offset is identical for the header + every body row, including
  the CJK row that drifted before the fix.
- Live: hermes chat -q with the user-reported screenshot prompt now
  produces a perfectly aligned table on the wire (header, divider,
  4 body rows including '通义千问', all pipes at identical columns).
This commit is contained in:
Teknium 2026-05-11 11:13:06 -07:00 committed by GitHub
parent 657874460f
commit 1d00716754
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 559 additions and 15 deletions

View file

@ -0,0 +1,210 @@
"""Tests for `agent.markdown_tables.realign_markdown_tables`.
These cover the alignment guarantee on CJK / wide-character tables and
the conservative no-op behaviour on non-table input.
"""
from __future__ import annotations
from textwrap import dedent
from wcwidth import wcswidth
from agent.markdown_tables import (
is_table_divider,
looks_like_table_row,
realign_markdown_tables,
split_table_row,
)
def _column_offsets(line: str) -> list[int]:
    """Return the display-cell position of each ``|`` found in ``line``.

    The line is walked one character at a time while a running cell
    position is kept. Each character advances the position by the width
    ``wcswidth`` reports for it; any non-positive result (zero as well as
    negative) is counted as a single cell.
    """
    offsets: list[int] = []
    cursor = 0
    for glyph in line:
        if glyph == "|":
            # Record where the pipe starts, before advancing past it.
            offsets.append(cursor)
        glyph_width = wcswidth(glyph)
        if glyph_width <= 0:
            # Fall back to one cell whenever no positive width is reported.
            glyph_width = 1
        cursor += glyph_width
    return offsets
# ---------------------------------------------------------------------------
# split_table_row / is_table_divider / looks_like_table_row
# ---------------------------------------------------------------------------
def test_split_strips_outer_pipes_and_trims():
    """``split_table_row`` drops outer pipes and trims each cell."""
    cases = [
        # Bordered row with padded cells.
        ("| a | b | c |", ["a", "b", "c"]),
        # Bordered CJK row with no padding at all.
        ("|配置|状态|", ["配置", "状态"]),
        # Row without leading / trailing pipes.
        ("a | b | c", ["a", "b", "c"]),
    ]
    for row, expected_cells in cases:
        assert split_table_row(row) == expected_cells
def test_is_table_divider_handles_alignment_colons():
    """Divider detection accepts alignment colons and rejects look-alikes."""
    accepted = (
        "|---|---|",
        "| :--- | ---: | :---: |",
    )
    rejected = (
        "| - | - |",  # 1 dash is not a divider
        "| a | b |",
        "---",  # single column, no pipes
    )
    for candidate in accepted:
        assert is_table_divider(candidate)
    for candidate in rejected:
        assert not is_table_divider(candidate)
def test_looks_like_table_row():
    """Row detection needs a leading pipe or at least two pipes."""
    table_rows = (
        "| a | b |",
        "a | b | c",  # no leading pipe, ≥2 pipes
    )
    other_lines = (
        "not a table",
        "a | b",  # one pipe, no leading pipe
        "",
    )
    for candidate in table_rows:
        assert looks_like_table_row(candidate)
    for candidate in other_lines:
        assert not looks_like_table_row(candidate)
# ---------------------------------------------------------------------------
# realign_markdown_tables
# ---------------------------------------------------------------------------
def test_no_op_on_text_without_tables():
    """Prose containing a stray pipe is returned unchanged."""
    prose = "Hello world\nThis has no | pipes table.\n"
    realigned = realign_markdown_tables(prose)
    assert realigned == prose
def test_no_op_when_pipes_but_no_divider():
    """Pipe-bearing lines with no divider row are left untouched."""
    shell_pipelines = "echo a | grep b\necho c | wc -l\n"
    realigned = realign_markdown_tables(shell_pipelines)
    assert realigned == shell_pipelines
def test_cjk_table_pipes_align_across_rows():
    """A CJK-heavy table is rebuilt so every row shares pipe columns."""
    # Input as a model would emit it: padded without regard to the
    # two-cell width of CJK glyphs.
    table = dedent(
        """\
        | 配置 | Config | 论文 (%) | 复现 (%) | 差值 | 状态 |
        |------|--------|---------|---------|------|------|
        | Vicuna (report) | dense | 79.30 | 未完成 | - | × |
        | ChatGLM | chat | 37.60 | 37.82 | +0.22 | |
        | 通义千问 | qwen | () | 报错 | - | × |
        """
    )
    rows = realign_markdown_tables(table).rstrip("\n").split("\n")

    # The alignment guarantee: each rebuilt row puts its pipes at the
    # same display columns as the first row.
    pipe_positions = [_column_offsets(row) for row in rows]
    expected = pipe_positions[0]
    misaligned = [found for found in pipe_positions if found != expected]
    assert not misaligned, (
        "rebuilt table rows do not share pipe column offsets:\n"
        + "\n".join(rows)
    )

    # Six columns plus the two outer borders give seven pipes per row.
    assert len(expected) == 7
def test_emoji_with_cjk_table_aligns():
    """A table mixing CJK text and bare emoji is rebuilt with aligned pipes.

    The status column carries bare emoji (no variation selector) so the
    realigner is actually exercised on emoji cells; an empty status column
    would reduce this to a plain CJK table.
    """
    src = dedent(
        """\
        | 模型 | 状态 | 备注 |
        |------|------|------|
        | 千问 | ✅ | 通过 |
        | Claude | ✅ | 推理强 |
        | 文心一言 | ❌ | 报错 |
        """
    )
    out = realign_markdown_tables(src).rstrip("\n").split("\n")
    offsets = [_column_offsets(row) for row in out]
    # The emoji-with-variation-selector case (⚠️) intentionally tolerates
    # 1-cell drift; bare emoji like ✅ / ❌ have stable wcwidth and must
    # align. Use bare emoji here so the assertion is hard.
    assert all(o == offsets[0] for o in offsets), (
        "emoji+CJK rows do not share pipe column offsets:\n" + "\n".join(out)
    )
def test_already_aligned_ascii_table_remains_aligned():
    """An ASCII table that is already aligned still comes out aligned.

    The fixture is padded so every row has its pipes at the same columns
    before the realigner runs; otherwise the test would only cover the
    "misaligned input" path that other tests already exercise.
    """
    src = dedent(
        """\
        | a   | b   |
        |-----|-----|
        | 1   | 2   |
        | foo | bar |
        """
    )
    out = realign_markdown_tables(src).rstrip("\n").split("\n")
    offsets = [_column_offsets(row) for row in out]
    assert all(o == offsets[0] for o in offsets)
def test_passes_non_table_lines_through_around_a_table():
    """Prose before and after a table survives; the table itself aligns."""
    document = dedent(
        """\
        Here is a comparison:
        | 模型 | 状态 |
        |------|------|
        | 千问 | 通过 |
        And some prose after.
        """
    )
    rendered = realign_markdown_tables(document)

    # Surrounding prose is untouched.
    assert rendered.startswith("Here is a comparison:\n")
    assert rendered.endswith("And some prose after.\n")

    # Every pipe-bearing line belongs to the table and must share the
    # first table row's pipe columns.
    table_rows = [line for line in rendered.split("\n") if "|" in line]
    reference = _column_offsets(table_rows[0])
    for row in table_rows[1:]:
        assert _column_offsets(row) == reference
def test_handles_ragged_rows_by_padding_short_rows():
    """A body row with fewer cells than the header is padded to match."""
    table = dedent(
        """\
        | a | b | c |
        |---|---|---|
        | 1 | 2 |
        | x | y | z |
        """
    )
    rows = realign_markdown_tables(table).rstrip("\n").split("\n")
    pipe_positions = [_column_offsets(row) for row in rows]
    header_positions = pipe_positions[0]

    # Short rows must gain the missing cells: same pipe count as the
    # header ...
    for positions in pipe_positions:
        assert len(positions) == len(header_positions)
    # ... and the same pipe columns as the header.
    for positions in pipe_positions:
        assert positions == header_positions
def test_multiple_tables_in_one_text():
    """Two tables separated by prose are each aligned independently."""
    document = dedent(
        """\
        First:
        | 配置 | |
        |------|----|
        | 通义 | 1 |
        Second:
        | model | n |
        |-------|---|
        | gpt | 2 |
        """
    )
    rendered = realign_markdown_tables(document)

    # Group consecutive pipe-bearing lines into table blocks; any line
    # without a pipe closes the block in progress.
    blocks: list[list[str]] = []
    inside_table = False
    for line in rendered.split("\n"):
        if "|" not in line:
            inside_table = False
            continue
        if not inside_table:
            blocks.append([])
            inside_table = True
        blocks[-1].append(line)

    assert len(blocks) == 2

    # Alignment is checked per block, not across the two tables.
    for block in blocks:
        positions = [_column_offsets(row) for row in block]
        reference = positions[0]
        assert all(found == reference for found in positions), (
            "block did not align:\n" + "\n".join(block)
        )

View file

@ -118,14 +118,37 @@ def test_strip_mode_preserves_table_structure_while_cleaning_cell_markdown():
)
output = _render_to_text(renderable)
assert "| Syntax | Example |" in output
assert "|---|---|" in output
assert "| Bold | bold |" in output
assert "| Strike | strike |" in output
# Inline cell markdown is stripped (the contract this test enforces).
assert "**" not in output
assert "~~" not in output
assert "`" not in output
# Cell *content* survives, even if the surrounding whitespace was
# rewritten by the wcwidth-aware re-aligner. Asserting on bare
# cell text keeps this test focused on the strip behaviour rather
# than snapshotting incidental column padding (which is what the
# CJK-alignment fix changes).
assert "Syntax" in output
assert "Example" in output
assert "Bold" in output and "bold" in output
assert "Strike" in output and "strike" in output
# Structural sanity: the table still renders as pipe-bordered rows
# (header + divider + 2 body rows).
body_rows = [ln for ln in output.splitlines() if ln.strip().startswith("|")]
assert len(body_rows) == 4
# Every rendered table row shares the same pipe column offsets — the
# alignment guarantee from realign_markdown_tables.
pipe_cols = [
[i for i, ch in enumerate(row) if ch == "|"] for row in body_rows
]
assert all(p == pipe_cols[0] for p in pipe_cols), (
"table rows misaligned after strip-mode rendering:\n"
+ "\n".join(body_rows)
)
def test_final_assistant_content_can_leave_markdown_raw():
renderable = _render_final_assistant_content("***Bold italic***", mode="raw")