Skip to content

Commit c74447e

Browse files
committed
Improve Korean keep-all mixed-script breaks
1 parent b0b86e1 commit c74447e

3 files changed

Lines changed: 196 additions & 10 deletions

File tree

src/analysis.ts

Lines changed: 107 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,34 @@ export function setAnalysisLocale(locale?: string): void {
9898
const arabicScriptRe = /\p{Script=Arabic}/u
9999
const combiningMarkRe = /\p{M}/u
100100
const decimalDigitRe = /\p{Nd}/u
101+
// Korean app/product labels frequently combine Hangul with ASCII letters,
// digits, and light in-token punctuation. URL, query-string, and key-value
// delimiters are different: they must stay structural boundaries and must not
// be folded into a single keep-all token.
const keepAllTextRunSeparators = new Set([
  '/',
  '?',
  '&',
  '=',
  ':',
])
// Punctuation that isKoreanKeepAllCompactText accepts inside a compact token.
const koreanKeepAllInnerPunctuation = new Set([
  '.',
  '-',
  '_',
  '(',
  ')',
])
101106

102107
/** Reports whether `text` contains at least one character matched by `arabicScriptRe`. */
function containsArabicScript(text: string): boolean {
  // `arabicScriptRe` has no global/sticky flag, so a positional search is
  // equivalent to a plain membership test.
  return text.search(arabicScriptRe) !== -1
}
105110

111+
/**
 * Reports whether `codePoint` falls inside one of the Unicode ranges that
 * carry Hangul letters or syllables.
 *
 * Ranges are matched at block granularity, so a few unassigned code points
 * inside those blocks are accepted as well. The Halfwidth Hangul variants are
 * included so that text using them is treated like any other Hangul text.
 *
 * @param codePoint - A Unicode code point (for example from `codePointAt`).
 * @returns `true` when the code point belongs to a Hangul range.
 */
function isHangulCodePoint(codePoint: number): boolean {
  return (
    // Hangul Jamo
    (codePoint >= 0x1100 && codePoint <= 0x11FF) ||
    // Hangul Compatibility Jamo
    (codePoint >= 0x3130 && codePoint <= 0x318F) ||
    // Hangul Jamo Extended-A
    (codePoint >= 0xA960 && codePoint <= 0xA97F) ||
    // Hangul Syllables followed directly by Hangul Jamo Extended-B
    (codePoint >= 0xAC00 && codePoint <= 0xD7FF) ||
    // Halfwidth Hangul variants (Halfwidth and Fullwidth Forms block)
    (codePoint >= 0xFFA0 && codePoint <= 0xFFDC)
  )
}
120+
121+
/**
 * Reports whether `codePoint` is an ASCII digit or an ASCII letter of either
 * case.
 */
function isAsciiAlphaNumericCodePoint(codePoint: number): boolean {
  // '0'..'9'
  if (codePoint >= 0x30 && codePoint <= 0x39) return true
  // 'A'..'Z'
  if (codePoint >= 0x41 && codePoint <= 0x5A) return true
  // 'a'..'z'
  return codePoint >= 0x61 && codePoint <= 0x7A
}
128+
106129
function isCJKCodePoint(codePoint: number): boolean {
107130
return (
108131
(codePoint >= 0x4E00 && codePoint <= 0x9FFF) ||
@@ -146,6 +169,47 @@ export function isCJK(s: string): boolean {
146169
return false
147170
}
148171

172+
/** Reports whether any code point of `text` satisfies `isHangulCodePoint`. */
function containsHangulText(text: string): boolean {
  let index = 0
  while (index < text.length) {
    const codePoint = text.codePointAt(index)!
    if (isHangulCodePoint(codePoint)) return true
    // Supplementary-plane code points occupy two UTF-16 units.
    index += codePoint > 0xFFFF ? 2 : 1
  }
  return false
}
178+
179+
/**
 * Reports whether `text` contains any character listed in
 * `keepAllTextRunSeparators`.
 */
export function containsKeepAllTextRunSeparator(text: string): boolean {
  // Every separator is a single UTF-16 code unit, so scanning code units finds
  // exactly the same matches as scanning code points.
  for (let index = 0; index < text.length; index++) {
    if (keepAllTextRunSeparators.has(text.charAt(index))) return true
  }
  return false
}
185+
186+
/**
 * Reports whether `text` contains a keep-all separator that blocks it from
 * joining a run. A `?` or `:` that is the very last character of `text` is
 * tolerated; every other separator occurrence blocks.
 */
function containsBlockingKeepAllTextRunEntrySeparator(text: string): boolean {
  const lastIndex = text.length - 1
  for (let index = 0; index < text.length; index++) {
    const ch = text.charAt(index)
    if (!keepAllTextRunSeparators.has(ch)) continue
    const isTrailingQuestionOrColon = index === lastIndex && (ch === '?' || ch === ':')
    if (!isTrailingQuestionOrColon) return true
  }
  return false
}
196+
197+
/**
 * Reports whether `text` is non-empty and made up solely of Hangul code
 * points, ASCII letters/digits, and characters from
 * `koreanKeepAllInnerPunctuation`.
 */
function isKoreanKeepAllCompactText(text: string): boolean {
  if (text === '') return false
  for (const ch of text) {
    const cp = ch.codePointAt(0)!
    const isAllowed =
      isHangulCodePoint(cp) ||
      isAsciiAlphaNumericCodePoint(cp) ||
      koreanKeepAllInnerPunctuation.has(ch)
    if (!isAllowed) return false
  }
  return true
}
212+
149213
function endsWithLineStartProhibitedText(text: string): boolean {
150214
const last = getLastCodePoint(text)
151215
return last !== null && (kinsokuStart.has(last) || leftStickyPunctuation.has(last))
@@ -174,6 +238,23 @@ export function canContinueKeepAllTextRun(previousText: string): boolean {
174238
)
175239
}
176240

241+
/**
 * Decides whether a keep-all run may continue from `previousText` into
 * `nextText`.
 *
 * `canContinueKeepAllTextRun(previousText)` must always hold. When either side
 * contains Hangul, the boundary is additionally rejected if `previousText`
 * contains a keep-all separator or `nextText` contains a blocking one.
 */
export function canContinueKeepAllTextRunAcrossBoundary(previousText: string, nextText: string): boolean {
  if (!canContinueKeepAllTextRun(previousText)) return false

  const touchesHangul = containsHangulText(previousText) || containsHangulText(nextText)
  if (!touchesHangul) return true

  return !(
    containsKeepAllTextRunSeparator(previousText) ||
    containsBlockingKeepAllTextRunEntrySeparator(nextText)
  )
}
251+
252+
/**
 * Decides whether `previousText` and `nextText` may be merged under the Korean
 * keep-all rule: the boundary must be continuable, at least one side must
 * contain Hangul, and both sides must be compact Korean keep-all text.
 */
export function canContinueKeepAllTextRunForKorean(previousText: string, nextText: string): boolean {
  const boundaryAllowed = canContinueKeepAllTextRunAcrossBoundary(previousText, nextText)
  if (!boundaryAllowed) return false

  const involvesHangul = containsHangulText(previousText) || containsHangulText(nextText)
  return (
    involvesHangul &&
    isKoreanKeepAllCompactText(previousText) &&
    isKoreanKeepAllCompactText(nextText)
  )
}
257+
177258
export const kinsokuStart = new Set([
178259
'\uFF0C',
179260
'\uFF0E',
@@ -1194,14 +1275,18 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
11941275
let pendingStart = 0
11951276
let pendingContainsCJK = false
11961277
let pendingCanContinue = false
1278+
let pendingStartedAfterKeepAllSeparator = false
1279+
let previousTextHadKeepAllSeparator = false
11971280

11981281
function flushPendingText(): void {
11991282
if (pendingTextParts === null) return
1200-
texts.push(joinTextParts(pendingTextParts))
1283+
const text = joinTextParts(pendingTextParts)
1284+
texts.push(text)
12011285
isWordLike.push(pendingWordLike)
12021286
kinds.push('text')
12031287
starts.push(pendingStart)
12041288
pendingTextParts = null
1289+
previousTextHadKeepAllSeparator = containsKeepAllTextRunSeparator(text)
12051290
}
12061291

12071292
for (let i = 0; i < segmentation.len; i++) {
@@ -1214,12 +1299,25 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
12141299
const textContainsCJK = containsCJKText(text)
12151300
const textCanContinue = canContinueKeepAllTextRun(text)
12161301

1217-
if (pendingTextParts !== null && pendingContainsCJK && pendingCanContinue) {
1218-
pendingTextParts.push(text)
1219-
pendingWordLike = pendingWordLike || wordLike
1220-
pendingContainsCJK = pendingContainsCJK || textContainsCJK
1221-
pendingCanContinue = textCanContinue
1222-
continue
1302+
if (pendingTextParts !== null) {
1303+
const previousText = pendingTextParts[pendingTextParts.length - 1]!
1304+
const canContinueAcrossBoundary = canContinueKeepAllTextRunAcrossBoundary(previousText, text)
1305+
const canUseDefaultCJKKeepAll =
1306+
pendingContainsCJK &&
1307+
pendingCanContinue &&
1308+
canContinueAcrossBoundary &&
1309+
(!pendingStartedAfterKeepAllSeparator || textContainsCJK)
1310+
const canUseKoreanKeepAll =
1311+
!pendingStartedAfterKeepAllSeparator &&
1312+
canContinueKeepAllTextRunForKorean(previousText, text)
1313+
1314+
if (canUseDefaultCJKKeepAll || canUseKoreanKeepAll) {
1315+
pendingTextParts.push(text)
1316+
pendingWordLike = pendingWordLike || wordLike
1317+
pendingContainsCJK = pendingContainsCJK || textContainsCJK
1318+
pendingCanContinue = textCanContinue
1319+
continue
1320+
}
12231321
}
12241322

12251323
flushPendingText()
@@ -1228,6 +1326,7 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
12281326
pendingStart = start
12291327
pendingContainsCJK = textContainsCJK
12301328
pendingCanContinue = textCanContinue
1329+
pendingStartedAfterKeepAllSeparator = previousTextHadKeepAllSeparator
12311330
continue
12321331
}
12331332

@@ -1236,6 +1335,7 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
12361335
isWordLike.push(wordLike)
12371336
kinds.push(kind)
12381337
starts.push(start)
1338+
previousTextHadKeepAllSeparator = false
12391339
}
12401340

12411341
flushPendingText()

src/layout.test.ts

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,74 @@ describe('prepare invariants', () => {
570570
expect(prepareWithSegments('foo\u00A0世界', FONT, { wordBreak: 'keep-all' }).segments).toEqual(['foo\u00A0', '世界'])
571571
})
572572

573+
test('keep-all keeps compact Korean mixed-script tokens together', () => {
  // Each token mixes Hangul with ASCII letters/digits and must come back as
  // exactly one segment.
  const compactTokens = [
    'AI정보공학과',
    'README카드생성기',
    'api문서v2가이드',
    '2026학년도공지',
    '한글ABC123혼합문장',
    'GitHubREADME한글가이드',
    '공지사항v2업데이트',
  ]

  for (const token of compactTokens) {
    const { segments } = prepareWithSegments(token, FONT, { wordBreak: 'keep-all' })
    expect(segments).toEqual([token])
  }
})
586+
587+
test('keep-all does not merge Korean text across path and query separators', () => {
  // [input, expected segments] pairs, checked in order.
  const cases: ReadonlyArray<readonly [string, string[]]> = [
    ['검색어?정렬=최신', ['검색어?', '정렬', '=', '최신']],
    ['docs/README한글가이드', ['docs', '/', 'README', '한글가이드']],
    ['hello:한글테스트', ['hello:', '한글테스트']],
    ['path/to/한글문서', ['path', '/', 'to', '/', '한글문서']],
    ['key=value한글', ['key', '=', 'value', '한글']],
    ['한글&영문조합', ['한글', '&', '영문조합']],
  ]

  for (const [input, expected] of cases) {
    const { segments } = prepareWithSegments(input, FONT, { wordBreak: 'keep-all' })
    expect(segments).toEqual(expected)
  }
})
623+
624+
test('keep-all preserves Korean mixed-punctuation token behavior', () => {
  // Tokens with inner punctuation are expected to stay whole.
  const punctuatedTokens = [
    '공지사항(수정본)',
    'v2.1한글업데이트',
    '한글-영문-혼합',
    '한글_영문_조합',
  ]
  for (const token of punctuatedTokens) {
    const { segments } = prepareWithSegments(token, FONT, { wordBreak: 'keep-all' })
    expect(segments).toEqual([token])
  }

  // U+200B between the two parts is expected to remain its own segment.
  const withZeroWidthSpace = prepareWithSegments('AI\u200B정보공학과', FONT, { wordBreak: 'keep-all' })
  expect(withZeroWidthSpace.segments).toEqual(['AI', '\u200B', '정보공학과'])
})
640+
573641
test('adjacent CJK text units stay breakable after visible text, not only after spaces', () => {
574642
const prepared = prepareWithSegments('foo 世界 bar', FONT)
575643
expect(prepared.segments).toEqual(['foo', ' ', '世', '界', ' ', 'bar'])

src/layout.ts

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,10 @@ import { computeSegmentLevels } from './bidi.js'
3636
import {
3737
analyzeText,
3838
canContinueKeepAllTextRun,
39+
canContinueKeepAllTextRunAcrossBoundary,
40+
canContinueKeepAllTextRunForKorean,
3941
clearAnalysisCaches,
42+
containsKeepAllTextRunSeparator,
4043
endsWithClosingQuote,
4144
isCJK,
4245
isNumericRunSegment,
@@ -282,20 +285,34 @@ function mergeKeepAllTextUnits(units: MeasuredTextUnit[]): MeasuredTextUnit[] {
282285
let currentStart = units[0]!.start
283286
let currentContainsCJK = isCJK(units[0]!.text)
284287
let currentCanContinue = canContinueKeepAllTextRun(units[0]!.text)
288+
let currentStartedAfterKeepAllSeparator = false
289+
let previousTextHadKeepAllSeparator = false
285290

286291
function flushCurrent(): void {
292+
const text = currentTextParts.length === 1 ? currentTextParts[0]! : currentTextParts.join('')
287293
merged.push({
288-
text: currentTextParts.length === 1 ? currentTextParts[0]! : currentTextParts.join(''),
294+
text,
289295
start: currentStart,
290296
})
297+
previousTextHadKeepAllSeparator = containsKeepAllTextRunSeparator(text)
291298
}
292299

293300
for (let i = 1; i < units.length; i++) {
294301
const next = units[i]!
295302
const nextContainsCJK = isCJK(next.text)
296303
const nextCanContinue = canContinueKeepAllTextRun(next.text)
297-
298-
if (currentContainsCJK && currentCanContinue) {
304+
const previousText = currentTextParts[currentTextParts.length - 1]!
305+
const canContinueAcrossBoundary = canContinueKeepAllTextRunAcrossBoundary(previousText, next.text)
306+
const canUseDefaultCJKKeepAll =
307+
currentContainsCJK &&
308+
currentCanContinue &&
309+
canContinueAcrossBoundary &&
310+
(!currentStartedAfterKeepAllSeparator || nextContainsCJK)
311+
const canUseKoreanKeepAll =
312+
!currentStartedAfterKeepAllSeparator &&
313+
canContinueKeepAllTextRunForKorean(previousText, next.text)
314+
315+
if (canUseDefaultCJKKeepAll || canUseKoreanKeepAll) {
299316
currentTextParts.push(next.text)
300317
currentContainsCJK = currentContainsCJK || nextContainsCJK
301318
currentCanContinue = nextCanContinue
@@ -307,6 +324,7 @@ function mergeKeepAllTextUnits(units: MeasuredTextUnit[]): MeasuredTextUnit[] {
307324
currentStart = next.start
308325
currentContainsCJK = nextContainsCJK
309326
currentCanContinue = nextCanContinue
327+
currentStartedAfterKeepAllSeparator = previousTextHadKeepAllSeparator
310328
}
311329

312330
flushCurrent()

0 commit comments

Comments
 (0)