mirror of
https://github.com/NousResearch/hermes-agent.git
synced 2026-05-18 04:41:56 +00:00
fix(cli,tui): align CJK / wide-char markdown tables (#23863)
CJK and emoji glyphs render as two terminal cells but JS String#length and the model's own padding count them as one, so any markdown table with Chinese / Japanese / Korean cells drifts right per row when a real terminal renders it. Both surfaces fix this with a display-cell width measurement (wcswidth on the Python side, stringWidth on the TUI side). Changes: - agent/markdown_tables.py: new helper. realign_markdown_tables(text) detects markdown table blocks (header + |---| divider) and rewrites the row padding using wcwidth.wcswidth so every pipe and dash lines up across rows. No-op on text without tables. - cli.py: hook the helper into _render_final_assistant_content for strip / render modes (raw passes through untouched), and into the streaming line emitter so live token-by-token rendering also produces aligned tables. A small two-buffer state machine in _emit_stream_text holds table rows until the block ends, then flushes them through the realigner so all rows pad to a single per-column width. - ui-tui/src/components/markdown.tsx: renderTable now uses stringWidth (Bun.stringWidth fast path + East-Asian-width-aware fallback, already memoised in @hermes/ink) instead of UTF-16 String#length for both column-width measurement and per-cell padding. Drops the comment that documented the bug as a deliberate limitation. Validation: - New tests/agent/test_markdown_tables.py (11): every rebuilt block shares pipe column offsets across rows for pure CJK, mixed CJK+emoji, ragged-row, and multi-table inputs. - Updated tests/cli/test_cli_markdown_rendering.py: the existing strip-mode test asserted exact whitespace; rewritten to assert the alignment contract (cell content survives + every rendered row shares pipe offsets). - New ui-tui markdown.test.ts case (1): rendered column-2 start offset is identical for the header + every body row, including the CJK row that drifted before the fix. 
- Live: hermes chat -q with the user-reported screenshot prompt now produces a perfectly aligned table on the wire (header, divider, 4 body rows including '通义千问', all pipes at identical columns).
This commit is contained in:
parent
657874460f
commit
1d00716754
6 changed files with 559 additions and 15 deletions
170
agent/markdown_tables.py
Normal file
170
agent/markdown_tables.py
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
"""CJK/wide-character-aware re-alignment of model-emitted markdown tables.
|
||||
|
||||
Models pad markdown tables assuming each character occupies one terminal
|
||||
cell. CJK glyphs and most emoji render as two cells, so the model's
|
||||
spacing collapses into drift the moment a table reaches a real terminal —
|
||||
header pipes line up, every body row drifts right by N cells per CJK
|
||||
char.
|
||||
|
||||
This module rebuilds row padding using ``wcwidth.wcswidth`` (display
|
||||
columns), preserving the table's pipes and dashes so it still reads as a
|
||||
plain-text table in ``strip`` / unrendered display modes. Standard Rich
|
||||
markdown rendering already aligns CJK correctly inside a wide enough
|
||||
panel; this helper is for the paths that print the model's text more or
|
||||
less verbatim.
|
||||
|
||||
The helper is deliberately conservative:
|
||||
|
||||
* Only contiguous ``| ... |`` blocks with a divider line are rewritten.
|
||||
* Anything that does not look like a table is passed through unchanged.
|
||||
* Single-line / mid-stream fragments are left alone — callers buffer
|
||||
table rows and flush them once the block is complete.
|
||||
|
||||
There is a small, intentional caveat: ``wcwidth`` returns ``-1`` for some
|
||||
emoji-with-variation-selector sequences (e.g. ``⚠️``); we clamp those to
|
||||
0 so they do not corrupt the column width math. The 1-cell drift on
|
||||
those specific glyphs is preferable to silently widening every table
|
||||
that contains one.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
from wcwidth import wcswidth
|
||||
|
||||
# Public names exported by ``from ... import *``, in alphabetical order.
__all__ = [
    "is_table_divider", "looks_like_table_row",
    "realign_markdown_tables", "split_table_row",
]


# One cell of a divider row: at least three dashes, optionally wrapped in
# alignment colons and surrounded by whitespace.
_DIVIDER_CELL_RE = re.compile(r"^\s*:?-{3,}:?\s*$")

# Narrowest column ever emitted; equals the shortest dash run that
# ``_DIVIDER_CELL_RE`` accepts.
_MIN_COL_WIDTH = 3
|
||||
|
||||
|
||||
def _disp_width(s: str) -> int:
    """Return the terminal display width of ``s`` as a non-negative integer.

    ``wcswidth`` answers ``-1`` for the *whole* string as soon as it meets a
    single non-printable character. Clamping that result to zero would make
    an otherwise ordinary cell measure as empty and push its row out of
    alignment by the full width of the cell, so in that case the string is
    measured one character at a time and only the offending characters are
    counted as zero-width. The per-character sum is an approximation for
    multi-codepoint sequences, but it is only used on this fallback path.
    """
    total = wcswidth(s)
    if total >= 0:
        return total
    # ``wcswidth`` on a one-character string yields that character's own
    # width, or ``-1`` when it is non-printable; clamp those to zero so a
    # negative number never reaches the column-width math.
    return sum(max(wcswidth(ch), 0) for ch in s)
|
||||
|
||||
|
||||
def _pad_to_width(s: str, target: int) -> str:
    """Right-pad ``s`` with spaces until it spans ``target`` display cells.

    Text that already measures ``target`` cells or more comes back unchanged.
    """
    missing = target - _disp_width(s)
    if missing <= 0:
        return s
    return s + " " * missing
|
||||
|
||||
|
||||
# A pipe that is not preceded by a backslash, i.e. a real column delimiter.
_UNESCAPED_PIPE_RE = re.compile(r"(?<!\\)\|")


def split_table_row(row: str) -> List[str]:
    r"""Split ``| a | b | c |`` into ``["a", "b", "c"]`` with trims.

    The optional leading and trailing pipes are dropped and every cell is
    stripped of surrounding whitespace. A backslash-escaped pipe (``\|``) is
    literal cell content in a markdown table, so it neither separates
    columns nor counts as the closing pipe of the row; the escape sequence
    is kept verbatim in the returned cell.
    """
    s = row.strip()
    if s.startswith("|"):
        s = s[1:]
    # Only a bare trailing pipe closes the row; ``\|`` belongs to the cell.
    if s.endswith("|") and not s.endswith("\\|"):
        s = s[:-1]
    return [cell.strip() for cell in _UNESCAPED_PIPE_RE.split(s)]
|
||||
|
||||
|
||||
def is_table_divider(row: str) -> bool:
    """Report whether ``row`` is the separator line under a table header.

    A divider needs at least two cells, and every cell must match the
    divider-cell pattern (a dash run with optional alignment colons).
    """
    cells = split_table_row(row)
    if len(cells) < 2:
        return False
    for cell in cells:
        if _DIVIDER_CELL_RE.match(cell) is None:
            return False
    return True
|
||||
|
||||
|
||||
def looks_like_table_row(row: str) -> bool:
    """Guess whether ``row`` might be one line of a markdown table.

    Streaming callers use this to decide whether an in-flight line should
    be held back. The check is deliberately loose: only blocks that come
    with a divider are ever rewritten, so a wrong guess here costs no more
    than a short delay before the line is printed.
    """
    text = row.strip()
    if "|" not in text:
        return False
    # A leading pipe is the clearest hint. Rows without one still qualify
    # when they carry two or more pipes, which covers models that leave
    # the opening pipe off.
    return text[0] == "|" or text.count("|") >= 2
|
||||
|
||||
|
||||
def _render_block(rows: List[List[str]]) -> List[str]:
    """Lay out a table so every column has one shared display width.

    ``rows`` holds the header cells first and the body rows after it; the
    divider is not part of the input and is generated here. Short rows are
    filled with empty cells up to the widest row. The result is the header
    line, a dash-only divider line, then one line per body row.
    """
    column_count = max(map(len, rows))
    grid = [cells + [""] * (column_count - len(cells)) for cells in rows]

    # Each column is as wide as its widest cell, but never narrower than
    # the minimum column width.
    col_widths: List[int] = []
    for idx in range(column_count):
        widest = max(_disp_width(cells[idx]) for cells in grid)
        col_widths.append(max(_MIN_COL_WIDTH, widest))

    def format_row(cells: List[str]) -> str:
        padded = [
            _pad_to_width(cell, width) for cell, width in zip(cells, col_widths)
        ]
        return "| " + " | ".join(padded) + " |"

    header, *body = grid
    # Two extra dashes per column stand in for the spaces around each cell.
    divider = "|" + "|".join("-" * (width + 2) for width in col_widths) + "|"
    return [format_row(header), divider, *(format_row(cells) for cells in body)]
|
||||
|
||||
|
||||
def _copy_alignment_markers(rebuilt: str, original: str) -> str:
    """Carry ``:`` alignment markers from ``original`` over to ``rebuilt``.

    ``rebuilt`` is the dash-only divider line produced by ``_render_block``
    (``|-----|-----|``) and ``original`` is the divider line found in the
    input. For each column present in both, a leading and/or trailing colon
    in the original cell replaces the first and/or last dash of the rebuilt
    run, so the line keeps its length and the pipes stay where they are.
    """
    runs = rebuilt.strip("|").split("|")
    for k, mark in enumerate(split_table_row(original)[: len(runs)]):
        if mark.startswith(":"):
            runs[k] = ":" + runs[k][1:]
        if mark.endswith(":"):
            runs[k] = runs[k][:-1] + ":"
    return "|" + "|".join(runs) + "|"


def realign_markdown_tables(text: str) -> str:
    """Rewrite every ``| ... |`` + divider block with wcwidth-aware padding.

    A table is a line containing a pipe whose next line is a divider; the
    lines after the divider belong to the table for as long as they contain
    a pipe. Each such block is re-rendered with uniform column widths, and
    the column alignment markers (``:---``, ``:---:``, ``---:``) of the
    original divider are kept on the rebuilt one. Additional divider lines
    inside the body are skipped.

    Lines that are not part of a recognised table are returned verbatim,
    so this is safe to apply to arbitrary assistant prose.
    """
    if "|" not in text:
        return text

    lines = text.split("\n")
    out: List[str] = []
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i]
        # A table starts with a header row whose next line is a divider.
        if "|" in line and i + 1 < n and is_table_divider(lines[i + 1]):
            header = split_table_row(line)
            body: List[List[str]] = []
            j = i + 2
            # A line containing a pipe is never blank, so the pipe check
            # alone ends the block at the first blank or pipe-less line.
            while j < n and "|" in lines[j]:
                if not is_table_divider(lines[j]):
                    body.append(split_table_row(lines[j]))
                j += 1

            if any(header) or body:
                block = _render_block([header] + body)
                # ``_render_block`` emits the divider as its second line
                # using dashes only; restore the alignment colons here so
                # its signature stays untouched.
                block[1] = _copy_alignment_markers(block[1], lines[i + 1])
                out.extend(block)
                i = j
                continue
        out.append(line)
        i += 1

    return "\n".join(out)
|
||||
Loading…
Add table
Add a link
Reference in a new issue