-
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathindex.ts
85 lines (73 loc) · 2.29 KB
/
index.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import { UNICODE_RANGES } from "./languages";
// Highest code point the bitmap can represent (205743 === 0x323AF).
const CHINESE_MAX_CODE_POINT = 0x323af;
// 11904 === 0x2E80. Not referenced by the code visible in this file.
const CHINESE_MIN_CODE_POINT = 0x2e80;
// Number of code points tracked per byte of the bitmap (one per bit).
const BYTE_SIZE = 8;

// BITMAP answers "is this code point a word boundary?" with a single bit
// per code point: code point `c` lives in byte `floor(c / BYTE_SIZE)` at bit
// `c % BYTE_SIZE`. Packing 8 code points into each byte shrinks the table
// from 205.7 KB (one byte per code point) to 25.7 KB.
// One byte is added on top of the whole-byte quotient so that the byte
// holding the highest code point exists.
const BITMAP = new Uint8Array(
  Math.floor(CHINESE_MAX_CODE_POINT / BYTE_SIZE) + 1
);
/**
 * Marks the first code point of every given string as a match in BITMAP.
 *
 * Only the first code point of each string is used; any further characters
 * in the string are ignored. Empty strings are skipped.
 *
 * @param chars Strings whose first code point should be flagged.
 */
function insertCharsIntoMap(...chars: string[]) {
  for (const char of chars) {
    // codePointAt (rather than charCodeAt) so that a character above U+FFFF
    // is registered under its real code point and not its high surrogate.
    const codePoint = char.codePointAt(0);
    if (codePoint === undefined) continue;

    const byteIndex = Math.floor(codePoint / BYTE_SIZE);
    const bitIndex = codePoint % BYTE_SIZE;
    // OR, not XOR: inserting the same character twice (or a character whose
    // bit is already set) must leave the bit set instead of toggling it off.
    BITMAP[byteIndex] |= 1 << bitIndex;
  }
}
/**
 * Flags a whole range of code points as matches by setting every byte of
 * BITMAP that the range touches to 0xFF.
 *
 * The range is applied at byte granularity: the start is rounded down and
 * the end is rounded up to a multiple of BYTE_SIZE, so code points that
 * share a byte with either edge of the range are flagged as well.
 *
 * @param from First code point of the range.
 * @param to End of the range.
 */
function insertRangeIntoMap(from: number, to: number) {
  // The start index must be an integer: writes to a typed array at a
  // fractional index are silently ignored, and since the loop only ever adds
  // 1, a fractional start would skip the entire range.
  const firstByte = Math.floor(from / BYTE_SIZE);
  // Exclusive upper bound.
  // NOTE(review): when `to` is an exact multiple of BYTE_SIZE the byte that
  // contains `to` itself is not filled — confirm whether the ranges passed
  // in are meant to be end-inclusive.
  const endByte = Math.ceil(to / BYTE_SIZE);

  for (let i = firstByte; i < endByte; i++) {
    BITMAP[i] = 0b11111111;
  }
}
const NEWLINE = "\n";

// Individual characters that are registered in the bitmap as word boundaries.
const WORD_SEPARATORS: string[] = [
  " ",
  "\n",
  "\t",
  "\v",
  "*",
  "/",
  "&",
  ":",
  ";",
  ".",
  ",",
  "?",
  "=",
  // U+0F0B TIBETAN MARK INTERSYLLABIC TSHEG ("tsek"): ends a syllable in Tibetan.
  "\u0F0B",
  // U+1361 ETHIOPIC WORDSPACE: the traditional word separator in Ethiopic text.
  "\u1361",
  // ZERO WIDTH SPACE: can also be treated as a word boundary.
  "\u200b"
];

// Single characters are registered first, then the whole Unicode ranges.
insertCharsIntoMap(...WORD_SEPARATORS);

for (const range of UNICODE_RANGES) {
  const from = range[0];
  const to = range[1];
  insertRangeIntoMap(from, to);
}
/**
 * Counts the words in `str` using the BITMAP lookup table.
 *
 * A word is either:
 *  - a run of code points that are not flagged in BITMAP, ended by a flagged
 *    code point or by the end of the string; or
 *  - a single flagged code point whose whole BITMAP byte is 0xFF, which is
 *    how code points inserted as part of a range are recognised. Each of
 *    these counts as one word on its own.
 *
 * @param str Text to count words in.
 * @returns The number of words; 0 for an empty string.
 */
export function countWords(str: string): number {
  let count = 0;
  // True while we are inside a run of unflagged code points that has not
  // been counted yet.
  let wordPending = false;

  for (let i = 0; i < str.length; i++) {
    // Read a full code point instead of a UTF-16 code unit: the bitmap
    // covers code points above U+FFFF, which charCodeAt can never produce.
    const codePoint = str.codePointAt(i);
    if (codePoint === undefined) break;
    // A code point above U+FFFF occupies two code units; skip the second.
    if (codePoint > 0xffff) i++;

    const byteIndex = Math.floor(codePoint / BYTE_SIZE);
    const bitIndex = codePoint % BYTE_SIZE;
    // Code points beyond the end of the bitmap are never flagged.
    const byteAtIndex = BITMAP[byteIndex] ?? 0;
    const isMatch = ((byteAtIndex >> bitIndex) & 1) === 1;

    if (isMatch) {
      // The flagged code point closes the word that was in progress.
      if (wordPending) count++;
      // 255 means this is probably a Unicode range match, in which case the
      // code point is a word by itself. This is counted independently of the
      // pending word so that a word directly followed by a range character
      // is not lost.
      if (byteAtIndex === 0b11111111) count++;
    }
    wordPending = !isMatch;
  }

  // A trailing word that was not followed by any boundary.
  if (wordPending) count++;
  return count;
}
/**
 * Counts the lines in `str`.
 *
 * The result is the number of NEWLINE occurrences plus one, so an empty
 * string and a string without any newline both count as a single line.
 *
 * @param str Text to count lines in.
 * @returns The number of lines (always at least 1).
 */
export function countLines(str: string) {
  // Every string has at least one line; each newline found adds another.
  let lines = 1;
  let pos = str.indexOf(NEWLINE, 0);

  while (pos !== -1 && pos < str.length) {
    lines++;
    // Resume the search just past the newline that was found.
    pos = str.indexOf(NEWLINE, pos + 1);
  }

  return lines;
}