Skip to content

Commit 39d2fa9

Browse files
committed
Optimize utf8Write
1 parent 922b89e commit 39d2fa9

File tree

1 file changed

+145
-83
lines changed

1 file changed

+145
-83
lines changed

index.js

+145-83
Original file line numberDiff line numberDiff line change
@@ -480,7 +480,7 @@ function byteLength (string, encoding) {
480480
return len
481481
case 'utf8':
482482
case 'utf-8':
483-
return utf8ToBytes(string).length
483+
return utf8ByteLength(string)
484484
case 'ucs2':
485485
case 'ucs-2':
486486
case 'utf16le':
@@ -492,7 +492,7 @@ function byteLength (string, encoding) {
492492
return base64ToBytes(string).length
493493
default:
494494
if (loweredCase) {
495-
return mustMatch ? -1 : utf8ToBytes(string).length // assume utf8
495+
return mustMatch ? -1 : utf8ByteLength(string) // assume utf8
496496
}
497497
encoding = ('' + encoding).toLowerCase()
498498
loweredCase = true
@@ -870,7 +870,141 @@ function hexWrite (buf, string, offset, length) {
870870
}
871871

872872
/**
 * Encode `string` as UTF-8 directly into `buf`, starting at `offset`, without
 * building an intermediate byte array.
 *
 * Only whole characters are written: encoding stops at the first character
 * that does not fit in the remaining byte budget. Unpaired surrogates are
 * replaced by U+FFFD (3 bytes, written through `writeInvalid`).
 *
 * @param {Uint8Array} buf - destination; assumed to be an indexable byte
 *   container exposing `length`
 * @param {string} string - text to encode
 * @param {number} offset - index in `buf` of the first byte to write
 * @param {number} [length] - maximum number of bytes to write; values that are
 *   missing or larger than the space left in `buf` are clamped to that space
 * @returns {number} number of bytes written
 * @throws {Error} 'Invalid code point' if a decoded code point is >= 0x110000
 *   (defensive; not reachable from UTF-16 input)
 */
function utf8Write (buf, string, offset, length) {
  // Never budget more than the room left in `buf`: out-of-range stores on a
  // typed array are dropped silently, which would inflate the returned count.
  // The comparison is also false for `undefined`, selecting `available`.
  const available = buf.length - offset
  let remaining = length <= available ? length : available
  let leadSurrogate = 0
  let pos = offset

  for (let i = 0; i < string.length; i++) {
    let codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xd7ff && codePoint < 0xe000) {
      if (codePoint < 0xdc00) {
        // lead surrogate: a lead that was already pending never got its trail
        if (leadSurrogate) {
          if (remaining >= 3) pos = writeInvalid(buf, pos)
          remaining -= 3
        }

        // wait for the next unit to decide whether this lead is paired
        leadSurrogate = codePoint
        continue
      }

      if (!leadSurrogate) {
        // unexpected trail
        if (remaining >= 3) pos = writeInvalid(buf, pos)
        remaining -= 3
        continue
      }

      // valid surrogate pair
      codePoint = (((leadSurrogate - 0xd800) << 10) | (codePoint - 0xdc00)) + 0x10000
    } else if (leadSurrogate) {
      // valid bmp char, but last char was a lead
      if (remaining >= 3) pos = writeInvalid(buf, pos)
      remaining -= 3
    }

    leadSurrogate = 0

    // encode utf8; `remaining` may already be negative after a replacement
    // character that did not fit, in which case every branch breaks
    if (codePoint < 0x80) {
      if (remaining < 1) break
      buf[pos++] = codePoint
      remaining -= 1
    } else if (codePoint < 0x800) {
      if (remaining < 2) break
      buf[pos++] = (codePoint >> 6) | 0xc0
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 2
    } else if (codePoint < 0x10000) {
      if (remaining < 3) break
      buf[pos++] = (codePoint >> 12) | 0xe0
      buf[pos++] = ((codePoint >> 6) & 0x3f) | 0x80
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 3
    } else if (codePoint < 0x110000) {
      if (remaining < 4) break
      buf[pos++] = (codePoint >> 18) | 0xf0
      buf[pos++] = ((codePoint >> 12) & 0x3f) | 0x80
      buf[pos++] = ((codePoint >> 6) & 0x3f) | 0x80
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 4
    } else {
      throw new Error('Invalid code point')
    }
  }

  // A lead still pending here means the string ended before its trail arrived
  // (e.g. '\ud800' or '\ud800\ud800'); it must not be dropped silently.
  if (leadSurrogate && remaining >= 3) pos = writeInvalid(buf, pos)

  return pos - offset
}
953+
954+
/**
 * Count the bytes `string` occupies when encoded as UTF-8, without
 * materialising the encoded bytes.
 *
 * Each unpaired surrogate is counted as U+FFFD (3 bytes); a valid surrogate
 * pair counts as one 4-byte sequence.
 *
 * @param {string} string - text to measure
 * @returns {number} UTF-8 byte length
 */
function utf8ByteLength (string) {
  let leadSurrogate = 0
  let size = 0

  for (let i = 0; i < string.length; i++) {
    const codeUnit = string.charCodeAt(i)

    // is surrogate component
    if (codeUnit > 0xd7ff && codeUnit < 0xe000) {
      if (codeUnit < 0xdc00) {
        // lead surrogate: a lead that was already pending never got its trail
        if (leadSurrogate) size += 3

        // wait for the next unit to decide whether this lead is paired
        leadSurrogate = codeUnit
        continue
      }

      // trail surrogate: completes a pair (4 bytes) or stands alone (3 bytes)
      size += leadSurrogate ? 4 : 3
      leadSurrogate = 0
      continue
    }

    if (leadSurrogate) {
      // valid bmp char, but last char was a lead
      size += 3
      leadSurrogate = 0
    }

    // a BMP code unit needs 1, 2 or 3 bytes
    size += 1
    size += (codeUnit >= 0x80) | 0
    size += (codeUnit >= 0x800) | 0
  }

  // A lead still pending here means the string ended before its trail arrived
  // (e.g. '\ud800' or '\ud800\ud800'); count its replacement character too.
  if (leadSurrogate) size += 3

  return size
}
8751009

8761010
function asciiWrite (buf, string, offset, length) {
@@ -1990,90 +2124,18 @@ function base64clean (str) {
19902124
return str
19912125
}
19922126

1993-
/**
 * Encode `string` as UTF-8 and return the bytes as a plain array.
 *
 * Unpaired surrogates are replaced by U+FFFD (0xEF 0xBF 0xBD). Encoding stops
 * at the first character that would exceed the `units` byte budget.
 *
 * @param {string} string - text to encode
 * @param {number} [units] - maximum number of bytes to produce; any falsy
 *   value (including 0) means "no limit"
 * @returns {number[]} the encoded bytes
 * @throws {Error} 'Invalid code point' if a decoded code point is >= 0x110000
 *   (defensive; not reachable from UTF-16 input)
 */
function utf8ToBytes (string, units) {
  // Work on a local copy instead of reassigning the parameter.
  let budget = units || Infinity
  let leadSurrogate = 0
  const bytes = []

  for (let i = 0; i < string.length; ++i) {
    let codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xD7FF && codePoint < 0xE000) {
      if (codePoint < 0xDC00) {
        // lead surrogate: a lead that was already pending never got its trail
        if (leadSurrogate) {
          budget -= 3
          if (budget > -1) bytes.push(0xEF, 0xBF, 0xBD)
        }

        // wait for the next unit to decide whether this lead is paired
        leadSurrogate = codePoint
        continue
      }

      if (!leadSurrogate) {
        // unexpected trail
        budget -= 3
        if (budget > -1) bytes.push(0xEF, 0xBF, 0xBD)
        continue
      }

      // valid surrogate pair
      codePoint = (leadSurrogate - 0xD800 << 10 | codePoint - 0xDC00) + 0x10000
    } else if (leadSurrogate) {
      // valid bmp char, but last char was a lead
      budget -= 3
      if (budget > -1) bytes.push(0xEF, 0xBF, 0xBD)
    }

    leadSurrogate = 0

    // encode utf8
    if (codePoint < 0x80) {
      if ((budget -= 1) < 0) break
      bytes.push(codePoint)
    } else if (codePoint < 0x800) {
      if ((budget -= 2) < 0) break
      bytes.push(
        codePoint >> 0x6 | 0xC0,
        codePoint & 0x3F | 0x80
      )
    } else if (codePoint < 0x10000) {
      if ((budget -= 3) < 0) break
      bytes.push(
        codePoint >> 0xC | 0xE0,
        codePoint >> 0x6 & 0x3F | 0x80,
        codePoint & 0x3F | 0x80
      )
    } else if (codePoint < 0x110000) {
      if ((budget -= 4) < 0) break
      bytes.push(
        codePoint >> 0x12 | 0xF0,
        codePoint >> 0xC & 0x3F | 0x80,
        codePoint >> 0x6 & 0x3F | 0x80,
        codePoint & 0x3F | 0x80
      )
    } else {
      throw new Error('Invalid code point')
    }
  }

  // A lead still pending here means the string ended before its trail arrived
  // (e.g. '\ud800\ud800'); emit its replacement character instead of dropping it.
  if (leadSurrogate && (budget -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)

  return bytes
}
2072-
20732127
// Decode a base64 string into bytes: the input is first passed through
// `base64clean`, and the cleaned text is handed to `base64.toByteArray`.
function base64ToBytes (str) {
  const cleaned = base64clean(str)
  return base64.toByteArray(cleaned)
}
20762130

2131+
// Store the UTF-8 encoding of U+FFFD (Replacement Character) into `buf` at
// index `pos` and return the index just past the three bytes written.
function writeInvalid (buf, pos) {
  buf[pos] = 0xef
  buf[pos + 1] = 0xbf
  buf[pos + 2] = 0xbd
  return pos + 3
}
2138+
20772139
function blitBuffer (src, dst, offset, length) {
20782140
let i
20792141
for (i = 0; i < length; ++i) {

0 commit comments

Comments
 (0)