@@ -98,11 +98,34 @@ export function setAnalysisLocale(locale?: string): void {
// Unicode property-escape matchers; each requires the `u` flag.
// Matches any character whose Unicode script is Arabic.
const arabicScriptRe = /\p{Script=Arabic}/u
// Matches any character in the Mark general category (M: Mn, Mc, Me).
const combiningMarkRe = /\p{M}/u
// Matches any decimal digit (general category Nd), not only ASCII 0-9.
const decimalDigitRe = /\p{Nd}/u
// URL/query/key-value separators. These stay structural boundaries for
// keep-all text runs instead of being folded into a single token.
const keepAllTextRunSeparators = new Set(['/', '?', '&', '=', ':'])
// Korean app/product labels often mix Hangul with ASCII letters/digits and
// lightweight token punctuation; these are the punctuation characters allowed
// inside such a compact label.
const koreanKeepAllInnerPunctuation = new Set(['.', '-', '_', '(', ')'])
101106
// Returns true when `text` contains at least one character matched by
// `arabicScriptRe`. The regex has no `g`/`y` flag, so `test` keeps no state
// between calls.
function containsArabicScript(text: string): boolean {
  return arabicScriptRe.test(text)
}
105110
// Returns true when `codePoint` falls inside one of the Hangul ranges below.
// The check is purely range-based: unassigned code points inside a range also
// count, and halfwidth Hangul forms (U+FFA0..U+FFDC) are not covered.
function isHangulCodePoint(codePoint: number): boolean {
  return (
    (codePoint >= 0xAC00 && codePoint <= 0xD7AF) || // Hangul Syllables
    (codePoint >= 0x1100 && codePoint <= 0x11FF) || // Hangul Jamo
    (codePoint >= 0x3130 && codePoint <= 0x318F) || // Hangul Compatibility Jamo
    (codePoint >= 0xA960 && codePoint <= 0xA97F) || // Hangul Jamo Extended-A
    (codePoint >= 0xD7B0 && codePoint <= 0xD7FF) // Hangul Jamo Extended-B
  )
}
120+
// Returns true when `codePoint` is an ASCII digit or an ASCII letter of
// either case. Everything else, including '_' and non-ASCII letters, is
// rejected.
function isAsciiAlphaNumericCodePoint(codePoint: number): boolean {
  return (
    (codePoint >= 0x30 && codePoint <= 0x39) || // '0'..'9'
    (codePoint >= 0x41 && codePoint <= 0x5A) || // 'A'..'Z'
    (codePoint >= 0x61 && codePoint <= 0x7A) // 'a'..'z'
  )
}
128+
106129function isCJKCodePoint ( codePoint : number ) : boolean {
107130 return (
108131 ( codePoint >= 0x4E00 && codePoint <= 0x9FFF ) ||
@@ -146,6 +169,47 @@ export function isCJK(s: string): boolean {
146169 return false
147170}
148171
// Returns true when at least one code point of `text` satisfies
// `isHangulCodePoint`. An empty string yields false.
function containsHangulText(text: string): boolean {
  // `for...of` iterates by code point, so `ch` is never empty and
  // `codePointAt(0)` is always defined here.
  for (const ch of text) {
    if (isHangulCodePoint(ch.codePointAt(0)!)) return true
  }
  return false
}
178+
// Returns true when `text` contains any character from
// `keepAllTextRunSeparators`, at any position. An empty string yields false.
export function containsKeepAllTextRunSeparator(text: string): boolean {
  for (const ch of text) {
    if (keepAllTextRunSeparators.has(ch)) return true
  }
  return false
}
185+
// Returns true when `text` contains a character from
// `keepAllTextRunSeparators`, with one exemption: a '?' or ':' that is the
// very last character of `text` is ignored. Any other separator, or a '?' /
// ':' that is not last, makes the result true.
function containsBlockingKeepAllTextRunEntrySeparator(text: string): boolean {
  // Running end offset in UTF-16 code units, so it is directly comparable
  // with `text.length` even when `text` contains astral characters.
  let offset = 0
  for (const ch of text) {
    offset += ch.length
    if (!keepAllTextRunSeparators.has(ch)) continue
    // Trailing '?' / ':' does not block.
    if ((ch === '?' || ch === ':') && offset === text.length) continue
    return true
  }
  return false
}
196+
// Returns true when `text` is non-empty and every code point is Hangul (per
// `isHangulCodePoint`), an ASCII letter or digit, or a member of
// `koreanKeepAllInnerPunctuation`. Any other character, including
// whitespace, makes the result false.
function isKoreanKeepAllCompactText(text: string): boolean {
  if (text.length === 0) return false
  for (const ch of text) {
    // `ch` is a whole code point from `for...of`, so this is always defined.
    const codePoint = ch.codePointAt(0)!
    if (
      isHangulCodePoint(codePoint) ||
      isAsciiAlphaNumericCodePoint(codePoint) ||
      koreanKeepAllInnerPunctuation.has(ch)
    ) {
      continue
    }
    return false
  }
  return true
}
212+
149213function endsWithLineStartProhibitedText ( text : string ) : boolean {
150214 const last = getLastCodePoint ( text )
151215 return last !== null && ( kinsokuStart . has ( last ) || leftStickyPunctuation . has ( last ) )
@@ -174,6 +238,23 @@ export function canContinueKeepAllTextRun(previousText: string): boolean {
174238 )
175239}
176240
// Decides whether a keep-all text run may continue from `previousText` into
// `nextText`.
//
// `canContinueKeepAllTextRun(previousText)` must hold in every case. When
// neither side contains Hangul, that is the only condition. When either side
// contains Hangul, the run must additionally not cross a separator:
// `previousText` may contain no keep-all separator at all, and `nextText` may
// contain none other than a trailing '?' or ':'.
export function canContinueKeepAllTextRunAcrossBoundary(previousText: string, nextText: string): boolean {
  // Required on both paths, so evaluate it once up front.
  // NOTE(review): assumes canContinueKeepAllTextRun is side-effect free.
  if (!canContinueKeepAllTextRun(previousText)) return false

  const hasHangulBoundary = containsHangulText(previousText) || containsHangulText(nextText)
  if (!hasHangulBoundary) return true

  return (
    !containsKeepAllTextRunSeparator(previousText) &&
    !containsBlockingKeepAllTextRunEntrySeparator(nextText)
  )
}
251+
// Korean-specific keep-all continuation check. Returns true only when all of
// the following hold:
//   1. `canContinueKeepAllTextRunAcrossBoundary(previousText, nextText)`;
//   2. at least one of the two texts contains Hangul;
//   3. both texts are "compact" per `isKoreanKeepAllCompactText`.
export function canContinueKeepAllTextRunForKorean(previousText: string, nextText: string): boolean {
  if (!canContinueKeepAllTextRunAcrossBoundary(previousText, nextText)) return false
  // The across-boundary check also passes for non-Hangul pairs; this rule
  // applies only when Hangul is actually involved.
  if (!containsHangulText(previousText) && !containsHangulText(nextText)) return false
  return isKoreanKeepAllCompactText(previousText) && isKoreanKeepAllCompactText(nextText)
}
257+
177258export const kinsokuStart = new Set ( [
178259 '\uFF0C' ,
179260 '\uFF0E' ,
@@ -1194,14 +1275,18 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
11941275 let pendingStart = 0
11951276 let pendingContainsCJK = false
11961277 let pendingCanContinue = false
1278+ let pendingStartedAfterKeepAllSeparator = false
1279+ let previousTextHadKeepAllSeparator = false
11971280
11981281 function flushPendingText ( ) : void {
11991282 if ( pendingTextParts === null ) return
1200- texts . push ( joinTextParts ( pendingTextParts ) )
1283+ const text = joinTextParts ( pendingTextParts )
1284+ texts . push ( text )
12011285 isWordLike . push ( pendingWordLike )
12021286 kinds . push ( 'text' )
12031287 starts . push ( pendingStart )
12041288 pendingTextParts = null
1289+ previousTextHadKeepAllSeparator = containsKeepAllTextRunSeparator ( text )
12051290 }
12061291
12071292 for ( let i = 0 ; i < segmentation . len ; i ++ ) {
@@ -1214,12 +1299,25 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
12141299 const textContainsCJK = containsCJKText ( text )
12151300 const textCanContinue = canContinueKeepAllTextRun ( text )
12161301
1217- if ( pendingTextParts !== null && pendingContainsCJK && pendingCanContinue ) {
1218- pendingTextParts . push ( text )
1219- pendingWordLike = pendingWordLike || wordLike
1220- pendingContainsCJK = pendingContainsCJK || textContainsCJK
1221- pendingCanContinue = textCanContinue
1222- continue
1302+ if ( pendingTextParts !== null ) {
1303+ const previousText = pendingTextParts [ pendingTextParts . length - 1 ] !
1304+ const canContinueAcrossBoundary = canContinueKeepAllTextRunAcrossBoundary ( previousText , text )
1305+ const canUseDefaultCJKKeepAll =
1306+ pendingContainsCJK &&
1307+ pendingCanContinue &&
1308+ canContinueAcrossBoundary &&
1309+ ( ! pendingStartedAfterKeepAllSeparator || textContainsCJK )
1310+ const canUseKoreanKeepAll =
1311+ ! pendingStartedAfterKeepAllSeparator &&
1312+ canContinueKeepAllTextRunForKorean ( previousText , text )
1313+
1314+ if ( canUseDefaultCJKKeepAll || canUseKoreanKeepAll ) {
1315+ pendingTextParts . push ( text )
1316+ pendingWordLike = pendingWordLike || wordLike
1317+ pendingContainsCJK = pendingContainsCJK || textContainsCJK
1318+ pendingCanContinue = textCanContinue
1319+ continue
1320+ }
12231321 }
12241322
12251323 flushPendingText ( )
@@ -1228,6 +1326,7 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
12281326 pendingStart = start
12291327 pendingContainsCJK = textContainsCJK
12301328 pendingCanContinue = textCanContinue
1329+ pendingStartedAfterKeepAllSeparator = previousTextHadKeepAllSeparator
12311330 continue
12321331 }
12331332
@@ -1236,6 +1335,7 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
12361335 isWordLike . push ( wordLike )
12371336 kinds . push ( kind )
12381337 starts . push ( start )
1338+ previousTextHadKeepAllSeparator = false
12391339 }
12401340
12411341 flushPendingText ( )
0 commit comments