mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-23 05:31:23 +00:00
fix(cli,tui): align CJK / wide-char markdown tables (#23863)
CJK and emoji glyphs render as two terminal cells but JS String#length and the model's own padding count them as one, so any markdown table with Chinese / Japanese / Korean cells drifts right per row when a real terminal renders it. Both surfaces fix this with a display-cell width measurement (wcswidth on the Python side, stringWidth on the TUI side). Changes: - agent/markdown_tables.py: new helper. realign_markdown_tables(text) detects markdown table blocks (header + |---| divider) and rewrites the row padding using wcwidth.wcswidth so every pipe and dash lines up across rows. No-op on text without tables. - cli.py: hook the helper into _render_final_assistant_content for strip / render modes (raw passes through untouched), and into the streaming line emitter so live token-by-token rendering also produces aligned tables. A small two-buffer state machine in _emit_stream_text holds table rows until the block ends, then flushes them through the realigner so all rows pad to a single per-column width. - ui-tui/src/components/markdown.tsx: renderTable now uses stringWidth (Bun.stringWidth fast path + East-Asian-width-aware fallback, already memoised in @hermes/ink) instead of UTF-16 String#length for both column-width measurement and per-cell padding. Drops the comment that documented the bug as a deliberate limitation. Validation: - New tests/agent/test_markdown_tables.py (11): every rebuilt block shares pipe column offsets across rows for pure CJK, mixed CJK+emoji, ragged-row, and multi-table inputs. - Updated tests/cli/test_cli_markdown_rendering.py: the existing strip-mode test asserted exact whitespace; rewritten to assert the alignment contract (cell content survives + every rendered row shares pipe offsets). - New ui-tui markdown.test.ts case (1): rendered column-2 start offset is identical for the header + every body row, including the CJK row that drifted before the fix. - Live: hermes chat -q with the user-reported screenshot prompt now produces a perfectly aligned table on the wire (header, divider, 4 body rows including '通义千问', all pipes at identical columns).
This commit is contained in:
parent
657874460f
commit
1d00716754
6 changed files with 559 additions and 15 deletions
170
agent/markdown_tables.py
Normal file
170
agent/markdown_tables.py
Normal file
|
|
@ -0,0 +1,170 @@
|
||||||
|
"""CJK/wide-character-aware re-alignment of model-emitted markdown tables.
|
||||||
|
|
||||||
|
Models pad markdown tables assuming each character occupies one terminal
|
||||||
|
cell. CJK glyphs and most emoji render as two cells, so the model's
|
||||||
|
spacing collapses into drift the moment a table reaches a real terminal —
|
||||||
|
header pipes line up, every body row drifts right by N cells per CJK
|
||||||
|
char.
|
||||||
|
|
||||||
|
This module rebuilds row padding using ``wcwidth.wcswidth`` (display
|
||||||
|
columns), preserving the table's pipes and dashes so it still reads as a
|
||||||
|
plain-text table in ``strip`` / unrendered display modes. Standard Rich
|
||||||
|
markdown rendering already aligns CJK correctly inside a wide enough
|
||||||
|
panel; this helper is for the paths that print the model's text more or
|
||||||
|
less verbatim.
|
||||||
|
|
||||||
|
The helper is deliberately conservative:
|
||||||
|
|
||||||
|
* Only contiguous ``| ... |`` blocks with a divider line are rewritten.
|
||||||
|
* Anything that does not look like a table is passed through unchanged.
|
||||||
|
* Single-line / mid-stream fragments are left alone — callers buffer
|
||||||
|
table rows and flush them once the block is complete.
|
||||||
|
|
||||||
|
There is a small, intentional caveat: ``wcwidth`` returns ``-1`` for some
|
||||||
|
emoji-with-variation-selector sequences (e.g. ``⚠️``); we clamp those to
|
||||||
|
0 so they do not corrupt the column width math. The 1-cell drift on
|
||||||
|
those specific glyphs is preferable to silently widening every table
|
||||||
|
that contains one.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from wcwidth import wcswidth
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"is_table_divider",
|
||||||
|
"looks_like_table_row",
|
||||||
|
"realign_markdown_tables",
|
||||||
|
"split_table_row",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
_DIVIDER_CELL_RE = re.compile(r"^\s*:?-{3,}:?\s*$")
|
||||||
|
_MIN_COL_WIDTH = 3 # matches the divider's minimum dash run.
|
||||||
|
|
||||||
|
|
||||||
|
def _disp_width(s: str) -> int:
|
||||||
|
"""``wcswidth`` clamped to a non-negative integer.
|
||||||
|
|
||||||
|
``wcswidth`` returns ``-1`` when it encounters a control char or an
|
||||||
|
unknown sequence; treat those as zero-width rather than letting a
|
||||||
|
negative number flow into ``max`` and break the column-width math.
|
||||||
|
"""
|
||||||
|
|
||||||
|
w = wcswidth(s)
|
||||||
|
return w if w > 0 else 0
|
||||||
|
|
||||||
|
|
||||||
|
def _pad_to_width(s: str, target: int) -> str:
|
||||||
|
return s + " " * max(0, target - _disp_width(s))
|
||||||
|
|
||||||
|
|
||||||
|
def split_table_row(row: str) -> List[str]:
|
||||||
|
"""Split ``| a | b | c |`` into ``["a", "b", "c"]`` with trims."""
|
||||||
|
|
||||||
|
s = row.strip()
|
||||||
|
if s.startswith("|"):
|
||||||
|
s = s[1:]
|
||||||
|
if s.endswith("|"):
|
||||||
|
s = s[:-1]
|
||||||
|
return [c.strip() for c in s.split("|")]
|
||||||
|
|
||||||
|
|
||||||
|
def is_table_divider(row: str) -> bool:
|
||||||
|
"""True when ``row`` is a markdown table separator line."""
|
||||||
|
|
||||||
|
cells = split_table_row(row)
|
||||||
|
return len(cells) > 1 and all(_DIVIDER_CELL_RE.match(c) for c in cells)
|
||||||
|
|
||||||
|
|
||||||
|
def looks_like_table_row(row: str) -> bool:
|
||||||
|
"""True when ``row`` could plausibly be a markdown table row.
|
||||||
|
|
||||||
|
Used by streaming callers to decide whether to buffer an in-flight
|
||||||
|
line. We are intentionally permissive here — the realigner itself
|
||||||
|
only rewrites blocks that are accompanied by a divider, so a false
|
||||||
|
positive here at most delays the print of one line.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if "|" not in row:
|
||||||
|
return False
|
||||||
|
stripped = row.strip()
|
||||||
|
if not stripped:
|
||||||
|
return False
|
||||||
|
# A leading pipe is the strongest signal; without it we still allow
|
||||||
|
# rows with at least two pipes so models that omit the leading pipe
|
||||||
|
# don't slip past us.
|
||||||
|
if stripped.startswith("|"):
|
||||||
|
return True
|
||||||
|
return stripped.count("|") >= 2
|
||||||
|
|
||||||
|
|
||||||
|
def _render_block(rows: List[List[str]]) -> List[str]:
|
||||||
|
"""Render ``rows`` (header + body, divider implied) at uniform widths."""
|
||||||
|
|
||||||
|
ncols = max(len(r) for r in rows)
|
||||||
|
rows = [r + [""] * (ncols - len(r)) for r in rows]
|
||||||
|
|
||||||
|
widths = [
|
||||||
|
max(_MIN_COL_WIDTH, *(_disp_width(r[c]) for r in rows))
|
||||||
|
for c in range(ncols)
|
||||||
|
]
|
||||||
|
|
||||||
|
def _row(cells: List[str]) -> str:
|
||||||
|
return (
|
||||||
|
"| "
|
||||||
|
+ " | ".join(_pad_to_width(c, widths[k]) for k, c in enumerate(cells))
|
||||||
|
+ " |"
|
||||||
|
)
|
||||||
|
|
||||||
|
out = [_row(rows[0])]
|
||||||
|
out.append("|" + "|".join("-" * (w + 2) for w in widths) + "|")
|
||||||
|
for r in rows[1:]:
|
||||||
|
out.append(_row(r))
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def realign_markdown_tables(text: str) -> str:
|
||||||
|
"""Rewrite every ``| ... |`` + divider block with wcwidth-aware padding.
|
||||||
|
|
||||||
|
Lines that are not part of a recognised table are returned verbatim,
|
||||||
|
so this is safe to apply to arbitrary assistant prose.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if "|" not in text:
|
||||||
|
return text
|
||||||
|
|
||||||
|
lines = text.split("\n")
|
||||||
|
out: List[str] = []
|
||||||
|
i = 0
|
||||||
|
n = len(lines)
|
||||||
|
|
||||||
|
while i < n:
|
||||||
|
line = lines[i]
|
||||||
|
# A table starts with a header row whose next line is a divider.
|
||||||
|
if (
|
||||||
|
"|" in line
|
||||||
|
and i + 1 < n
|
||||||
|
and is_table_divider(lines[i + 1])
|
||||||
|
):
|
||||||
|
header = split_table_row(line)
|
||||||
|
body: List[List[str]] = []
|
||||||
|
j = i + 2
|
||||||
|
while j < n and "|" in lines[j] and lines[j].strip():
|
||||||
|
if is_table_divider(lines[j]):
|
||||||
|
j += 1
|
||||||
|
continue
|
||||||
|
body.append(split_table_row(lines[j]))
|
||||||
|
j += 1
|
||||||
|
|
||||||
|
if any(c for c in header) or body:
|
||||||
|
out.extend(_render_block([header] + body))
|
||||||
|
i = j
|
||||||
|
continue
|
||||||
|
out.append(line)
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
return "\n".join(out)
|
||||||
94
cli.py
94
cli.py
|
|
@ -87,6 +87,11 @@ from agent.usage_pricing import (
|
||||||
format_duration_compact,
|
format_duration_compact,
|
||||||
format_token_count_compact,
|
format_token_count_compact,
|
||||||
)
|
)
|
||||||
|
from agent.markdown_tables import (
|
||||||
|
is_table_divider,
|
||||||
|
looks_like_table_row,
|
||||||
|
realign_markdown_tables,
|
||||||
|
)
|
||||||
# NOTE: `from agent.account_usage import ...` is deliberately NOT at module
|
# NOTE: `from agent.account_usage import ...` is deliberately NOT at module
|
||||||
# top — it transitively pulls the OpenAI SDK chain (~230 ms cold) and is only
|
# top — it transitively pulls the OpenAI SDK chain (~230 ms cold) and is only
|
||||||
# needed when the user runs `/limits`. Lazy-imported inside the handler below.
|
# needed when the user runs `/limits`. Lazy-imported inside the handler below.
|
||||||
|
|
@ -1355,12 +1360,21 @@ def _render_final_assistant_content(text: str, mode: str = "render"):
|
||||||
|
|
||||||
normalized_mode = str(mode or "render").strip().lower()
|
normalized_mode = str(mode or "render").strip().lower()
|
||||||
if normalized_mode == "strip":
|
if normalized_mode == "strip":
|
||||||
return _RichText(_strip_markdown_syntax(text))
|
# Strip first — inline markdown inside cells (`code`, **bold**, ~~strike~~)
|
||||||
|
# changes cell display width — then re-align so the column padding
|
||||||
|
# reflects the final visible text, not the marker-decorated source.
|
||||||
|
return _RichText(realign_markdown_tables(_strip_markdown_syntax(text)))
|
||||||
if normalized_mode == "raw":
|
if normalized_mode == "raw":
|
||||||
return _rich_text_from_ansi(text or "")
|
return _rich_text_from_ansi(text or "")
|
||||||
|
|
||||||
|
# `render` mode: Rich's Markdown renderer handles CJK width via wcwidth
|
||||||
|
# internally, so a pre-pass through realign_markdown_tables would just
|
||||||
|
# rewrite already-correct padding. But on the way in we still want to
|
||||||
|
# normalise model-emitted under-padded tables so that mid-render fallbacks
|
||||||
|
# (narrow panels, etc.) at least see consistent input.
|
||||||
plain = _rich_text_from_ansi(text or "").plain
|
plain = _rich_text_from_ansi(text or "").plain
|
||||||
plain = _preserve_windows_dot_segments_for_markdown(plain)
|
plain = _preserve_windows_dot_segments_for_markdown(plain)
|
||||||
|
plain = realign_markdown_tables(plain)
|
||||||
return Markdown(plain)
|
return Markdown(plain)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -2331,6 +2345,12 @@ class HermesCLI:
|
||||||
self._stream_started = False # True once first delta arrives
|
self._stream_started = False # True once first delta arrives
|
||||||
self._stream_box_opened = False # True once the response box header is printed
|
self._stream_box_opened = False # True once the response box header is printed
|
||||||
self._reasoning_preview_buf = "" # Coalesce tiny reasoning chunks for [thinking] output
|
self._reasoning_preview_buf = "" # Coalesce tiny reasoning chunks for [thinking] output
|
||||||
|
# Table-row buffer. When a streamed line looks like it could be
|
||||||
|
# part of a markdown table, hold it here until the block ends so
|
||||||
|
# we can re-pad with wcwidth-aware widths. Empty by default;
|
||||||
|
# populated only while `_in_stream_table` is True.
|
||||||
|
self._stream_table_buf: list[str] = []
|
||||||
|
self._in_stream_table = False
|
||||||
self._pending_edit_snapshots = {}
|
self._pending_edit_snapshots = {}
|
||||||
self._last_input_mode_recovery = 0.0
|
self._last_input_mode_recovery = 0.0
|
||||||
self._input_mode_recovery_notice_shown = False
|
self._input_mode_recovery_notice_shown = False
|
||||||
|
|
@ -3624,11 +3644,51 @@ class HermesCLI:
|
||||||
|
|
||||||
# Emit complete lines, keep partial remainder in buffer
|
# Emit complete lines, keep partial remainder in buffer
|
||||||
_tc = getattr(self, "_stream_text_ansi", "")
|
_tc = getattr(self, "_stream_text_ansi", "")
|
||||||
|
|
||||||
|
def _emit_one(printed_line: str) -> None:
|
||||||
|
_cprint(f"{_STREAM_PAD}{_tc}{printed_line}{_RST}" if _tc else f"{_STREAM_PAD}{printed_line}")
|
||||||
|
|
||||||
|
def _flush_table_buf() -> None:
|
||||||
|
buf = self._stream_table_buf
|
||||||
|
self._stream_table_buf = []
|
||||||
|
self._in_stream_table = False
|
||||||
|
if not buf:
|
||||||
|
return
|
||||||
|
# Strip cell-level markdown (`code`, **bold**, ~~strike~~) FIRST
|
||||||
|
# so the realigner pads to the final visible cell width, not
|
||||||
|
# the marker-decorated source width. Otherwise a body row
|
||||||
|
# like `` | Bold | `**bold**` | `` lands narrower than its
|
||||||
|
# header column once the markers are removed.
|
||||||
|
joined = "\n".join(buf)
|
||||||
|
if self.final_response_markdown == "strip":
|
||||||
|
joined = _strip_markdown_syntax(joined)
|
||||||
|
block = realign_markdown_tables(joined)
|
||||||
|
for ln in block.split("\n"):
|
||||||
|
_emit_one(ln)
|
||||||
|
|
||||||
while "\n" in self._stream_buf:
|
while "\n" in self._stream_buf:
|
||||||
line, self._stream_buf = self._stream_buf.split("\n", 1)
|
line, self._stream_buf = self._stream_buf.split("\n", 1)
|
||||||
|
|
||||||
|
# Hold table-shaped lines in a side-buffer so we can re-pad
|
||||||
|
# the whole block once it ends. Streaming line-by-line, we
|
||||||
|
# cannot re-align mid-table without reflowing already-printed
|
||||||
|
# rows; the cost is that the user sees the table appear in a
|
||||||
|
# single batch when the block closes instead of row-by-row.
|
||||||
|
if self._in_stream_table:
|
||||||
|
if looks_like_table_row(line) or is_table_divider(line):
|
||||||
|
self._stream_table_buf.append(line)
|
||||||
|
continue
|
||||||
|
# Block ended — flush the realigned table, then fall
|
||||||
|
# through to print the current (non-table) line.
|
||||||
|
_flush_table_buf()
|
||||||
|
elif looks_like_table_row(line):
|
||||||
|
self._stream_table_buf.append(line)
|
||||||
|
self._in_stream_table = True
|
||||||
|
continue
|
||||||
|
|
||||||
if self.final_response_markdown == "strip":
|
if self.final_response_markdown == "strip":
|
||||||
line = _strip_markdown_syntax(line)
|
line = _strip_markdown_syntax(line)
|
||||||
_cprint(f"{_STREAM_PAD}{_tc}{line}{_RST}" if _tc else f"{_STREAM_PAD}{line}")
|
_emit_one(line)
|
||||||
|
|
||||||
def _flush_stream(self) -> None:
|
def _flush_stream(self) -> None:
|
||||||
"""Emit any remaining partial line from the stream buffer and close the box."""
|
"""Emit any remaining partial line from the stream buffer and close the box."""
|
||||||
|
|
@ -3643,8 +3703,34 @@ class HermesCLI:
|
||||||
# Close reasoning box if still open (in case no content tokens arrived)
|
# Close reasoning box if still open (in case no content tokens arrived)
|
||||||
self._close_reasoning_box()
|
self._close_reasoning_box()
|
||||||
|
|
||||||
|
_tc = getattr(self, "_stream_text_ansi", "")
|
||||||
|
|
||||||
|
# If the stream buffer has a trailing partial line that looks like
|
||||||
|
# a table row, fold it into the table buffer so the whole block
|
||||||
|
# gets re-aligned together. Otherwise the final row prints raw
|
||||||
|
# (with the model's original under-padded spacing) while the rows
|
||||||
|
# above it are aligned.
|
||||||
|
if (
|
||||||
|
self._stream_buf
|
||||||
|
and getattr(self, "_in_stream_table", False)
|
||||||
|
and (looks_like_table_row(self._stream_buf) or is_table_divider(self._stream_buf))
|
||||||
|
):
|
||||||
|
self._stream_table_buf.append(self._stream_buf)
|
||||||
|
self._stream_buf = ""
|
||||||
|
|
||||||
|
# Flush any buffered table rows first so their padding is
|
||||||
|
# finalised before the stream remainder lands.
|
||||||
|
if getattr(self, "_stream_table_buf", None):
|
||||||
|
joined = "\n".join(self._stream_table_buf)
|
||||||
|
self._stream_table_buf = []
|
||||||
|
self._in_stream_table = False
|
||||||
|
if self.final_response_markdown == "strip":
|
||||||
|
joined = _strip_markdown_syntax(joined)
|
||||||
|
block = realign_markdown_tables(joined)
|
||||||
|
for ln in block.split("\n"):
|
||||||
|
_cprint(f"{_STREAM_PAD}{_tc}{ln}{_RST}" if _tc else f"{_STREAM_PAD}{ln}")
|
||||||
|
|
||||||
if self._stream_buf:
|
if self._stream_buf:
|
||||||
_tc = getattr(self, "_stream_text_ansi", "")
|
|
||||||
line = _strip_markdown_syntax(self._stream_buf) if self.final_response_markdown == "strip" else self._stream_buf
|
line = _strip_markdown_syntax(self._stream_buf) if self.final_response_markdown == "strip" else self._stream_buf
|
||||||
_cprint(f"{_STREAM_PAD}{_tc}{line}{_RST}" if _tc else f"{_STREAM_PAD}{line}")
|
_cprint(f"{_STREAM_PAD}{_tc}{line}{_RST}" if _tc else f"{_STREAM_PAD}{line}")
|
||||||
self._stream_buf = ""
|
self._stream_buf = ""
|
||||||
|
|
@ -3667,6 +3753,8 @@ class HermesCLI:
|
||||||
self._reasoning_buf = ""
|
self._reasoning_buf = ""
|
||||||
self._reasoning_preview_buf = ""
|
self._reasoning_preview_buf = ""
|
||||||
self._deferred_content = ""
|
self._deferred_content = ""
|
||||||
|
self._stream_table_buf = []
|
||||||
|
self._in_stream_table = False
|
||||||
|
|
||||||
def _slow_command_status(self, command: str) -> str:
|
def _slow_command_status(self, command: str) -> str:
|
||||||
"""Return a user-facing status message for slower slash commands."""
|
"""Return a user-facing status message for slower slash commands."""
|
||||||
|
|
|
||||||
210
tests/agent/test_markdown_tables.py
Normal file
210
tests/agent/test_markdown_tables.py
Normal file
|
|
@ -0,0 +1,210 @@
|
||||||
|
"""Tests for `agent.markdown_tables.realign_markdown_tables`.
|
||||||
|
|
||||||
|
These cover the alignment guarantee on CJK / wide-character tables and
|
||||||
|
the conservative no-op behaviour on non-table input.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from textwrap import dedent
|
||||||
|
|
||||||
|
from wcwidth import wcswidth
|
||||||
|
|
||||||
|
from agent.markdown_tables import (
|
||||||
|
is_table_divider,
|
||||||
|
looks_like_table_row,
|
||||||
|
realign_markdown_tables,
|
||||||
|
split_table_row,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _column_offsets(line: str) -> list[int]:
|
||||||
|
"""Return the display-cell index of every ``|`` in ``line``."""
|
||||||
|
|
||||||
|
cells: list[int] = []
|
||||||
|
width = 0
|
||||||
|
for ch in line:
|
||||||
|
if ch == "|":
|
||||||
|
cells.append(width)
|
||||||
|
# wcswidth on a single char; clamp negatives.
|
||||||
|
w = wcswidth(ch)
|
||||||
|
width += w if w > 0 else 1
|
||||||
|
return cells
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# split_table_row / is_table_divider / looks_like_table_row
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_split_strips_outer_pipes_and_trims():
|
||||||
|
assert split_table_row("| a | b | c |") == ["a", "b", "c"]
|
||||||
|
assert split_table_row("|配置|状态|") == ["配置", "状态"]
|
||||||
|
assert split_table_row("a | b | c") == ["a", "b", "c"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_table_divider_handles_alignment_colons():
|
||||||
|
assert is_table_divider("|---|---|")
|
||||||
|
assert is_table_divider("| :--- | ---: | :---: |")
|
||||||
|
assert not is_table_divider("| - | - |") # 1 dash is not a divider
|
||||||
|
assert not is_table_divider("| a | b |")
|
||||||
|
assert not is_table_divider("---") # single column, no pipes
|
||||||
|
|
||||||
|
|
||||||
|
def test_looks_like_table_row():
|
||||||
|
assert looks_like_table_row("| a | b |")
|
||||||
|
assert looks_like_table_row("a | b | c") # no leading pipe, ≥2 pipes
|
||||||
|
assert not looks_like_table_row("not a table")
|
||||||
|
assert not looks_like_table_row("a | b") # one pipe, no leading pipe
|
||||||
|
assert not looks_like_table_row("")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# realign_markdown_tables
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_op_on_text_without_tables():
|
||||||
|
text = "Hello world\nThis has no | pipes table.\n"
|
||||||
|
assert realign_markdown_tables(text) == text
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_op_when_pipes_but_no_divider():
|
||||||
|
text = "echo a | grep b\necho c | wc -l\n"
|
||||||
|
assert realign_markdown_tables(text) == text
|
||||||
|
|
||||||
|
|
||||||
|
def test_cjk_table_pipes_align_across_rows():
|
||||||
|
# Model-emitted (under-padded for CJK) input.
|
||||||
|
src = dedent(
|
||||||
|
"""\
|
||||||
|
| 配置 | Config | 论文 (%) | 复现 (%) | 差值 | 状态 |
|
||||||
|
|------|--------|---------|---------|------|------|
|
||||||
|
| Vicuna (report) | dense | 79.30 | 未完成 | - | × |
|
||||||
|
| ChatGLM | chat | 37.60 | 37.82 | +0.22 | ✓ |
|
||||||
|
| 通义千问 | qwen | (无) | 报错 | - | × |
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
|
||||||
|
out = realign_markdown_tables(src).rstrip("\n").split("\n")
|
||||||
|
|
||||||
|
# All rows in the rebuilt block must have pipes at identical display
|
||||||
|
# columns — that's the alignment guarantee.
|
||||||
|
offsets = [_column_offsets(row) for row in out]
|
||||||
|
assert all(o == offsets[0] for o in offsets), (
|
||||||
|
"rebuilt table rows do not share pipe column offsets:\n"
|
||||||
|
+ "\n".join(out)
|
||||||
|
)
|
||||||
|
# And we expect 7 pipes per row (6 columns + outer borders).
|
||||||
|
assert len(offsets[0]) == 7
|
||||||
|
|
||||||
|
|
||||||
|
def test_emoji_with_cjk_table_aligns():
|
||||||
|
src = dedent(
|
||||||
|
"""\
|
||||||
|
| 模型 | 状态 | 备注 |
|
||||||
|
|------|------|------|
|
||||||
|
| 千问 | ✅ | 通过 |
|
||||||
|
| Claude | ✅ | 推理强 |
|
||||||
|
| 文心一言 | ❌ | 报错 |
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
|
||||||
|
out = realign_markdown_tables(src).rstrip("\n").split("\n")
|
||||||
|
offsets = [_column_offsets(row) for row in out]
|
||||||
|
# The emoji-with-variation-selector case (⚠️) intentionally tolerates
|
||||||
|
# 1-cell drift; bare emoji like ✅ / ❌ have stable wcwidth and must
|
||||||
|
# align. Use bare emoji here so the assertion is hard.
|
||||||
|
assert all(o == offsets[0] for o in offsets), (
|
||||||
|
"emoji+CJK rows do not share pipe column offsets:\n" + "\n".join(out)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_already_aligned_ascii_table_remains_aligned():
|
||||||
|
src = dedent(
|
||||||
|
"""\
|
||||||
|
| a | b |
|
||||||
|
|-----|-----|
|
||||||
|
| 1 | 2 |
|
||||||
|
| foo | bar |
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
out = realign_markdown_tables(src).rstrip("\n").split("\n")
|
||||||
|
offsets = [_column_offsets(row) for row in out]
|
||||||
|
assert all(o == offsets[0] for o in offsets)
|
||||||
|
|
||||||
|
|
||||||
|
def test_passes_non_table_lines_through_around_a_table():
|
||||||
|
src = dedent(
|
||||||
|
"""\
|
||||||
|
Here is a comparison:
|
||||||
|
|
||||||
|
| 模型 | 状态 |
|
||||||
|
|------|------|
|
||||||
|
| 千问 | 通过 |
|
||||||
|
|
||||||
|
And some prose after.
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
|
||||||
|
out = realign_markdown_tables(src)
|
||||||
|
assert out.startswith("Here is a comparison:\n")
|
||||||
|
assert out.endswith("And some prose after.\n")
|
||||||
|
# And the table lines are aligned.
|
||||||
|
block = [ln for ln in out.split("\n") if "|" in ln]
|
||||||
|
offsets = [_column_offsets(row) for row in block]
|
||||||
|
assert all(o == offsets[0] for o in offsets)
|
||||||
|
|
||||||
|
|
||||||
|
def test_handles_ragged_rows_by_padding_short_rows():
|
||||||
|
src = dedent(
|
||||||
|
"""\
|
||||||
|
| a | b | c |
|
||||||
|
|---|---|---|
|
||||||
|
| 1 | 2 |
|
||||||
|
| x | y | z |
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
out = realign_markdown_tables(src).rstrip("\n").split("\n")
|
||||||
|
offsets = [_column_offsets(row) for row in out]
|
||||||
|
# Short rows must be padded out so they have the same pipe count
|
||||||
|
# and column positions as the header.
|
||||||
|
assert all(len(o) == len(offsets[0]) for o in offsets)
|
||||||
|
assert all(o == offsets[0] for o in offsets)
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_tables_in_one_text():
|
||||||
|
src = dedent(
|
||||||
|
"""\
|
||||||
|
First:
|
||||||
|
|
||||||
|
| 配置 | 值 |
|
||||||
|
|------|----|
|
||||||
|
| 通义 | 1 |
|
||||||
|
|
||||||
|
Second:
|
||||||
|
|
||||||
|
| model | n |
|
||||||
|
|-------|---|
|
||||||
|
| gpt | 2 |
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
out = realign_markdown_tables(src)
|
||||||
|
# Each table block individually aligns.
|
||||||
|
blocks: list[list[str]] = []
|
||||||
|
current: list[str] = []
|
||||||
|
for line in out.split("\n"):
|
||||||
|
if "|" in line:
|
||||||
|
current.append(line)
|
||||||
|
elif current:
|
||||||
|
blocks.append(current)
|
||||||
|
current = []
|
||||||
|
if current:
|
||||||
|
blocks.append(current)
|
||||||
|
|
||||||
|
assert len(blocks) == 2
|
||||||
|
for block in blocks:
|
||||||
|
offsets = [_column_offsets(row) for row in block]
|
||||||
|
assert all(o == offsets[0] for o in offsets), (
|
||||||
|
f"block did not align:\n" + "\n".join(block)
|
||||||
|
)
|
||||||
|
|
@ -118,14 +118,37 @@ def test_strip_mode_preserves_table_structure_while_cleaning_cell_markdown():
|
||||||
)
|
)
|
||||||
|
|
||||||
output = _render_to_text(renderable)
|
output = _render_to_text(renderable)
|
||||||
assert "| Syntax | Example |" in output
|
|
||||||
assert "|---|---|" in output
|
# Inline cell markdown is stripped (the contract this test enforces).
|
||||||
assert "| Bold | bold |" in output
|
|
||||||
assert "| Strike | strike |" in output
|
|
||||||
assert "**" not in output
|
assert "**" not in output
|
||||||
assert "~~" not in output
|
assert "~~" not in output
|
||||||
assert "`" not in output
|
assert "`" not in output
|
||||||
|
|
||||||
|
# Cell *content* survives, even if the surrounding whitespace was
|
||||||
|
# rewritten by the wcwidth-aware re-aligner. Asserting on bare
|
||||||
|
# cell text keeps this test focused on the strip behaviour rather
|
||||||
|
# than snapshotting incidental column padding (which is what the
|
||||||
|
# CJK-alignment fix changes).
|
||||||
|
assert "Syntax" in output
|
||||||
|
assert "Example" in output
|
||||||
|
assert "Bold" in output and "bold" in output
|
||||||
|
assert "Strike" in output and "strike" in output
|
||||||
|
|
||||||
|
# Structural sanity: the table still renders as pipe-bordered rows
|
||||||
|
# (header + divider + 2 body rows).
|
||||||
|
body_rows = [ln for ln in output.splitlines() if ln.strip().startswith("|")]
|
||||||
|
assert len(body_rows) == 4
|
||||||
|
|
||||||
|
# Every rendered table row shares the same pipe column offsets — the
|
||||||
|
# alignment guarantee from realign_markdown_tables.
|
||||||
|
pipe_cols = [
|
||||||
|
[i for i, ch in enumerate(row) if ch == "|"] for row in body_rows
|
||||||
|
]
|
||||||
|
assert all(p == pipe_cols[0] for p in pipe_cols), (
|
||||||
|
"table rows misaligned after strip-mode rendering:\n"
|
||||||
|
+ "\n".join(body_rows)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_final_assistant_content_can_leave_markdown_raw():
|
def test_final_assistant_content_can_leave_markdown_raw():
|
||||||
renderable = _render_final_assistant_content("***Bold italic***", mode="raw")
|
renderable = _render_final_assistant_content("***Bold italic***", mode="raw")
|
||||||
|
|
|
||||||
|
|
@ -217,3 +217,50 @@ describe('Md wrapping', () => {
|
||||||
expect(lines.some(line => line.startsWith(' hi ok'))).toBe(true)
|
expect(lines.some(line => line.startsWith(' hi ok'))).toBe(true)
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|
||||||
|
describe('renderTable CJK width alignment', () => {
|
||||||
|
it('column starts share the same display offset across CJK rows', async () => {
|
||||||
|
const { stringWidth } = await import('@hermes/ink')
|
||||||
|
|
||||||
|
const md = [
|
||||||
|
'| 配置 | Config | 状态 |',
|
||||||
|
'|------|--------|------|',
|
||||||
|
'| Vicuna (report) | dense | × |',
|
||||||
|
'| ChatGLM | chat | ✓ |',
|
||||||
|
'| 通义千问 | qwen | × |'
|
||||||
|
].join('\n')
|
||||||
|
|
||||||
|
// Pre-fix bug: ` `.repeat(w - stripInlineMarkup(...).length) used
|
||||||
|
// UTF-16 code units, so a CJK header cell padded to 2 cells while
|
||||||
|
// the body cell padded to 4, drifting subsequent columns by 2
|
||||||
|
// cells per CJK char.
|
||||||
|
//
|
||||||
|
// Post-fix contract: the prefix preceding the start of column N
|
||||||
|
// has the same display width across the header and every body row
|
||||||
|
// (deduped to skip the divider, which renders independently).
|
||||||
|
const lines = renderPlain(
|
||||||
|
React.createElement(Box, null, React.createElement(Md, { compact: true, t: DEFAULT_THEME, text: md }))
|
||||||
|
).filter(line => line.trim().length > 0)
|
||||||
|
|
||||||
|
// Heuristic: a "data row" line either contains 'Config' (header)
|
||||||
|
// or one of the body labels; a divider is all box-drawing. Use
|
||||||
|
// the substring 'Config' / 'dense' / 'chat' / 'qwen' as the
|
||||||
|
// unique anchor for column 2's start position on each row.
|
||||||
|
const colStarts = (line: string, anchor: string): number => {
|
||||||
|
const idx = line.indexOf(anchor)
|
||||||
|
return idx < 0 ? -1 : stringWidth(line.slice(0, idx))
|
||||||
|
}
|
||||||
|
|
||||||
|
const headerCol2 = lines.map(l => colStarts(l, 'Config')).find(v => v >= 0)
|
||||||
|
const denseCol2 = lines.map(l => colStarts(l, 'dense')).find(v => v >= 0)
|
||||||
|
const chatCol2 = lines.map(l => colStarts(l, 'chat')).find(v => v >= 0)
|
||||||
|
const qwenCol2 = lines.map(l => colStarts(l, 'qwen')).find(v => v >= 0)
|
||||||
|
|
||||||
|
expect(headerCol2).toBeDefined()
|
||||||
|
expect(denseCol2).toBe(headerCol2)
|
||||||
|
expect(chatCol2).toBe(headerCol2)
|
||||||
|
// The CJK row is the one that drifted before the fix. It must
|
||||||
|
// align with the rest now.
|
||||||
|
expect(qwenCol2).toBe(headerCol2)
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
import { Box, Link, Text } from '@hermes/ink'
|
import { Box, Link, stringWidth, Text } from '@hermes/ink'
|
||||||
import { Fragment, memo, type ReactNode, useMemo } from 'react'
|
import { Fragment, memo, type ReactNode, useMemo } from 'react'
|
||||||
|
|
||||||
import { ensureEmojiPresentation } from '../lib/emoji.js'
|
import { ensureEmojiPresentation } from '../lib/emoji.js'
|
||||||
|
|
@ -170,16 +170,22 @@ export const stripInlineMarkup = (v: string) =>
|
||||||
.replace(/\\\(([^\n]+?)\\\)/g, '$1')
|
.replace(/\\\(([^\n]+?)\\\)/g, '$1')
|
||||||
|
|
||||||
const renderTable = (k: number, rows: string[][], t: Theme) => {
|
const renderTable = (k: number, rows: string[][], t: Theme) => {
|
||||||
const widths = rows[0]!.map((_, ci) => Math.max(...rows.map(r => stripInlineMarkup(r[ci] ?? '').length)))
|
// Column widths in *display cells*, not UTF-16 code units. CJK
|
||||||
|
// glyphs and most emoji render as two cells but `String#length`
|
||||||
|
// counts them as one, which collapses Chinese / Japanese / Korean
|
||||||
|
// tables into drift across rows. `stringWidth` (Bun.stringWidth
|
||||||
|
// fast path + an East-Asian-width-aware fallback, memoised in
|
||||||
|
// @hermes/ink) returns the actual cell count.
|
||||||
|
const cellWidth = (raw: string) => stringWidth(stripInlineMarkup(raw))
|
||||||
|
|
||||||
|
const widths = rows[0]!.map((_, ci) => Math.max(...rows.map(r => cellWidth(r[ci] ?? ''))))
|
||||||
|
|
||||||
// Thin divider under the header. Without it tables look like prose
|
// Thin divider under the header. Without it tables look like prose
|
||||||
// with extra spacing because the header is just accent-coloured text
|
// with extra spacing because the header is just accent-coloured text
|
||||||
// (#15534). We avoid full borders on purpose — column widths come
|
// (#15534). We avoid full borders on purpose — column widths come
|
||||||
// from `stripInlineMarkup(...).length` (UTF-16 code units, not
|
// from `stringWidth(...)`, so the dividers and the row content stay
|
||||||
// display width), so a real outline often misaligns on emoji and
|
// in sync on CJK / emoji tables; tab-style column gaps still read
|
||||||
// East-Asian wide characters; one dim solid rule (`─`) under row 0
|
// cleanly without the boxed look.
|
||||||
// plus tab-style column gaps reads cleanly on every terminal we
|
|
||||||
// tested.
|
|
||||||
const sep = widths.map(w => '─'.repeat(Math.max(1, w))).join(' ')
|
const sep = widths.map(w => '─'.repeat(Math.max(1, w))).join(' ')
|
||||||
|
|
||||||
return (
|
return (
|
||||||
|
|
@ -190,7 +196,7 @@ const renderTable = (k: number, rows: string[][], t: Theme) => {
|
||||||
{widths.map((w, ci) => (
|
{widths.map((w, ci) => (
|
||||||
<Text bold={ri === 0} color={ri === 0 ? t.color.accent : undefined} key={ci}>
|
<Text bold={ri === 0} color={ri === 0 ? t.color.accent : undefined} key={ci}>
|
||||||
<MdInline t={t} text={row[ci] ?? ''} />
|
<MdInline t={t} text={row[ci] ?? ''} />
|
||||||
{' '.repeat(Math.max(0, w - stripInlineMarkup(row[ci] ?? '').length))}
|
{' '.repeat(Math.max(0, w - cellWidth(row[ci] ?? '')))}
|
||||||
{ci < widths.length - 1 ? ' ' : ''}
|
{ci < widths.length - 1 ? ' ' : ''}
|
||||||
</Text>
|
</Text>
|
||||||
))}
|
))}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue