fix(tui): wrap CJK text at grapheme boundaries

closes #5495
This commit is contained in:
Mario Zechner
2026-06-09 12:51:27 +02:00
Unverified
parent def99d395e
commit 8da077bcca
3 changed files with 75 additions and 16 deletions
+4
View File
@@ -2,6 +2,10 @@
## [Unreleased]
### Fixed
- Fixed wrapping for mixed Latin and CJK text so unspaced CJK runs can break at grapheme boundaries without leaving large trailing gaps ([#5495](https://github.com/earendil-works/pi/issues/5495)).
## [0.79.0] - 2026-06-08
### Fixed
+47 -16
View File
@@ -45,6 +45,9 @@ const rgiEmojiRegex = /^\p{RGI_Emoji}$/v;
const WIDTH_CACHE_SIZE = 512;
const widthCache = new Map<string, number>();
/**
 * Matches text containing at least one code point whose Unicode
 * Script_Extensions property lists Han, Hiragana, Katakana, Hangul or
 * Bopomofo.
 *
 * Script_Extensions (rather than Script) is used, so characters shared between
 * these scripts, such as the ideographic comma and full stop, match as well.
 * The expression has no `g`/`y` flag, which keeps `.test()` stateless across
 * calls.
 */
const cjkBreakRegex = new RegExp(
	"[" +
		["Han", "Hiragana", "Katakana", "Hangul", "Bopomofo"]
			.map((script) => "\\p{Script_Extensions=" + script + "}")
			.join("") +
		"]",
	"u",
);
function isPrintableAscii(str: string): boolean {
for (let i = 0; i < str.length; i++) {
const code = str.charCodeAt(i);
@@ -605,9 +608,18 @@ function splitIntoTokensWithAnsi(text: string): string[] {
const tokens: string[] = [];
let current = "";
let pendingAnsi = ""; // ANSI codes waiting to be attached to next visible content
let inWhitespace = false;
let currentKind: "space" | "word" | null = null;
let i = 0;
const flushCurrent = (): void => {
if (!current) {
return;
}
tokens.push(current);
current = "";
currentKind = null;
};
while (i < text.length) {
const ansiResult = extractAnsiCode(text, i);
if (ansiResult) {
@@ -617,29 +629,48 @@ function splitIntoTokensWithAnsi(text: string): string[] {
continue;
}
const char = text[i];
const charIsSpace = char === " ";
if (charIsSpace !== inWhitespace && current) {
// Switching between whitespace and non-whitespace, push current token
tokens.push(current);
current = "";
let end = i;
while (end < text.length && !extractAnsiCode(text, end)) {
end++;
}
// Attach any pending ANSI codes to this visible character
if (pendingAnsi) {
current += pendingAnsi;
pendingAnsi = "";
for (const { segment } of graphemeSegmenter.segment(text.slice(i, end))) {
const segmentIsSpace = segment === " ";
if (!segmentIsSpace && cjkBreakRegex.test(segment)) {
flushCurrent();
const token = pendingAnsi + segment;
pendingAnsi = "";
tokens.push(token);
continue;
}
const segmentKind = segmentIsSpace ? "space" : "word";
if (current && currentKind !== segmentKind) {
flushCurrent();
}
// Attach any pending ANSI codes to this visible character
if (pendingAnsi) {
current += pendingAnsi;
pendingAnsi = "";
}
currentKind = segmentKind;
current += segment;
}
inWhitespace = charIsSpace;
current += char;
i++;
i = end;
}
// Handle any remaining pending ANSI codes (attach to last token)
if (pendingAnsi) {
current += pendingAnsi;
if (current) {
current += pendingAnsi;
} else if (tokens.length > 0) {
tokens[tokens.length - 1] += pendingAnsi;
} else {
current = pendingAnsi;
}
}
if (current) {
+24
View File
@@ -111,6 +111,30 @@ describe("wrapTextWithAnsi", () => {
}
});
it("should break CJK runs at grapheme boundaries after Latin text", () => {
	// The Latin prefix "This is an example " is 19 columns wide. Assuming each
	// ideograph counts as two columns, ten of them bring the first line to 39,
	// so the unspaced CJK run has to be split in the middle rather than being
	// pushed to the next line as a single word.
	const maxWidth = 40;
	const input = "This is an example 中文汉字测试段落内容中文汉字测试段落内容.";
	const expectedLines = ["This is an example 中文汉字测试段落内容", "中文汉字测试段落内容."];

	const lines = wrapTextWithAnsi(input, maxWidth);

	assert.deepStrictEqual(lines, expectedLines);
	// Independently of the exact split, no line may exceed the requested width.
	lines.forEach((line) => {
		assert.ok(visibleWidth(line) <= maxWidth);
	});
});
it("should preserve color codes when wrapping CJK runs", () => {
	// Same text as the plain CJK wrapping case, but wrapped in a red SGR
	// sequence. The expected output shows the red code re-emitted at the start
	// of the continuation line and the reset appearing only at the very end.
	const RED = "\x1b[31m";
	const RESET = "\x1b[0m";
	const maxWidth = 40;
	const input = `${RED}This is an example 中文汉字测试段落内容中文汉字测试段落内容.${RESET}`;

	const lines = wrapTextWithAnsi(input, maxWidth);

	assert.strictEqual(lines.length, 2);
	const [firstLine, secondLine] = lines;
	assert.strictEqual(firstLine, `${RED}This is an example 中文汉字测试段落内容`);
	assert.strictEqual(secondLine, `${RED}中文汉字测试段落内容.${RESET}`);
	// Escape sequences must not count towards the visible width of a line.
	lines.forEach((line) => {
		assert.ok(visibleWidth(line) <= maxWidth);
	});
});
it("should ignore OSC 133 semantic markers in visible width", () => {
const text = "\x1b]133;A\x07hello\x1b]133;B\x07";
assert.strictEqual(visibleWidth(text), 5);