Skip to content

Commit 1ec3325

Browse files
committed
Fix extend character handling.
1 parent 6ac712d commit 1ec3325

File tree

2 files changed

+35
-21
lines changed

2 files changed

+35
-21
lines changed

Modules/word.c

+31-15
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,16 @@ static wb_property property(code_point)
7676
}
7777
/**********************************************************************/
7878

79+
__attribute__ ((pure))
80+
static wb_property char_property(node)
81+
const Char *node;
82+
{
83+
if (node == NULL) {
84+
return eot;
85+
}
86+
return property(node->value);
87+
}
88+
7989
/* Returns the next character, skipping Extend and Format characters.
8090
* WB4: Skip over Extend and Format characters. */
8191
__attribute__ ((pure))
@@ -88,7 +98,7 @@ static Char* skip_to_next(from)
8898

8999
do {
90100
from = from->next;
91-
} while (from != NULL && ExtendOrFormat(property(from->value)));
101+
} while (from != NULL && ExtendOrFormat(char_property(from)));
92102
return from;
93103
}
94104

@@ -99,14 +109,21 @@ static Char* skip_twice(from)
99109
return skip_to_next(skip_to_next(from));
100110
}
101111

102-
__attribute__ ((pure))
103-
static wb_property char_property(node)
104-
const Char *node;
112+
/* Returns the last character node of a grapheme, including extend and format
113+
* characters. */
114+
static Char* skip_to_end_of_extend(from)
115+
Char *from;
105116
{
106-
if (node == NULL) {
107-
return eot;
117+
if (from == NULL) {
118+
return NULL;
108119
}
109-
return property(node->value);
120+
121+
/* Skip until the LAST extend or format character. */
122+
while (from->next != NULL && ExtendOrFormat(char_property(from->next))) {
123+
from = from->next;
124+
}
125+
126+
return from;
110127
}
111128

112129
/*
@@ -131,13 +148,12 @@ static Char* find_next_boundary(start)
131148
return NULL;
132149
}
133150

134-
135-
/* Loop to find next word break. */
151+
/* Loop to find the next word break. */
136152

137153
/* WB2: Break at the start and end of text. */
138154
while (current->next != NULL) {
139155
/* Advance all the pointers. */
140-
current = current->next;
156+
current = skip_to_next(current);
141157
lookbehind = left;
142158
left = char_property(current);
143159
right = char_property(skip_to_next(current));
@@ -147,9 +163,9 @@ static Char* find_next_boundary(start)
147163
if (left == CR && right == LF) continue;
148164

149165
/* WB3a: Otherwise break before and after newlines */
150-
if (left == Newline || left == CR || left == LF) return current;
166+
if (left == Newline || left == CR || left == LF) break;
151167
/* WB3b */
152-
if (right == Newline || right == CR || right == LF) return current;
168+
if (right == Newline || right == CR || right == LF) break;
153169

154170
/* Ignore Format and Extend characters, except when they appear at the
155171
* beginning of a region of text. */
@@ -160,7 +176,7 @@ static Char* find_next_boundary(start)
160176

161177
/* WB6: Do not break letters across certain punctuation. */
162178
if (AHLetter(left) &&
163-
(right == MidLetter || MidNumLetQ(right)) &&
179+
(right == MidLetter || MidNumLetQ(right)) &&
164180
AHLetter(left)) continue;
165181
/* WB7 */
166182
if (AHLetter(lookbehind) &&
@@ -215,10 +231,10 @@ static Char* find_next_boundary(start)
215231
right == Regional_Indicator) continue;
216232

217233
/* WB14: Otherwise, break everywhere (including around ideographs). */
218-
return current;
234+
break;
219235
}
220236

221-
return current;
237+
return skip_to_end_of_extend(current);
222238
}
223239
/**********************************************************************/
224240

Tests/word_test.c

+4-6
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ TEST find_words_segments_japanese() {
116116
* instead of four, however, that would involve incorporating a Japanese
117117
* dictionary in order to look up Kanji words... */
118118
ASSERT_STR_EQ("伝", wordlist->third->string);
119-
ASSERT_STR_EQ("説", wordlist->third->string);
119+
ASSERT_STR_EQ("説", wordlist->fourth->string);
120120

121121
PASS();
122122
}
@@ -148,9 +148,7 @@ SUITE(find_words_suite) {
148148
RUN_TEST(find_words_returns_nfc);
149149
RUN_TEST(find_words_returns_zero_when_not_given_words);
150150
RUN_TEST(find_words_segments_english_with_punctuation);
151-
152-
/* Older tests. May still be useful... */
153-
/*RUN_TEST(find_words_segments_spanish_words);*/
154-
/*RUN_TEST(find_words_segments_numerals);*/
155-
/*RUN_TEST(find_words_segments_japanese);*/
151+
RUN_TEST(find_words_segments_spanish_words);
152+
RUN_TEST(find_words_segments_numerals);
153+
RUN_TEST(find_words_segments_japanese);
156154
}

0 commit comments

Comments
 (0)