@@ -480,7 +480,7 @@ function byteLength (string, encoding) {
480
480
return len
481
481
case 'utf8' :
482
482
case 'utf-8' :
483
- return utf8ToBytes ( string ) . length
483
+ return utf8ByteLength ( string )
484
484
case 'ucs2' :
485
485
case 'ucs-2' :
486
486
case 'utf16le' :
@@ -492,7 +492,7 @@ function byteLength (string, encoding) {
492
492
return base64ToBytes ( string ) . length
493
493
default :
494
494
if ( loweredCase ) {
495
- return mustMatch ? - 1 : utf8ToBytes ( string ) . length // assume utf8
495
+ return mustMatch ? - 1 : utf8ByteLength ( string ) // assume utf8
496
496
}
497
497
encoding = ( '' + encoding ) . toLowerCase ( )
498
498
loweredCase = true
@@ -870,7 +870,141 @@ function hexWrite (buf, string, offset, length) {
870
870
}
871
871
872
872
/**
 * Encodes `string` as UTF-8 straight into `buf`, without building an
 * intermediate byte array.
 *
 * Unpaired UTF-16 surrogates are written as U+FFFD (EF BF BD). A character is
 * only written when all of its bytes fit, so the output never ends in a
 * partial sequence.
 *
 * @param {Uint8Array} buf - Destination buffer.
 * @param {string} string - Text to encode.
 * @param {number} offset - Index in `buf` of the first byte to write.
 * @param {number} [length] - Maximum number of bytes to write. Defaults to,
 *   and is capped at, the space between `offset` and the end of `buf`.
 * @returns {number} Number of bytes written.
 */
function utf8Write (buf, string, offset, length) {
  // Cap the budget at the real capacity: stores past the end of a typed array
  // are silently dropped, which would make the returned count overstate what
  // was actually written.
  const capacity = buf.length - offset
  let remaining = length === undefined ? capacity : Math.min(length, capacity)
  let leadSurrogate = 0
  let pos = offset

  for (let i = 0; i < string.length; i++) {
    let codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xd7ff && codePoint < 0xe000) {
      if (codePoint < 0xdc00) {
        // lead surrogate: a lead still pending from the previous unit is unpaired
        if (leadSurrogate) {
          if (remaining >= 3) pos = writeInvalid(buf, pos)
          // the budget is charged even when the replacement did not fit, so
          // nothing after a dropped replacement is written either
          remaining -= 3
        }
        leadSurrogate = codePoint
        continue
      }

      if (!leadSurrogate) {
        // unexpected trail
        if (remaining >= 3) pos = writeInvalid(buf, pos)
        remaining -= 3
        continue
      }

      // valid surrogate pair
      codePoint = (((leadSurrogate - 0xd800) << 10) | (codePoint - 0xdc00)) + 0x10000
    } else if (leadSurrogate) {
      // valid bmp char, but last char was a lead
      if (remaining >= 3) pos = writeInvalid(buf, pos)
      remaining -= 3
    }

    leadSurrogate = 0

    // encode utf8
    if (codePoint < 0x80) {
      if (remaining < 1) break
      buf[pos++] = codePoint
      remaining -= 1
    } else if (codePoint < 0x800) {
      if (remaining < 2) break
      buf[pos++] = (codePoint >> 6) | 0xc0
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 2
    } else if (codePoint < 0x10000) {
      if (remaining < 3) break
      buf[pos++] = (codePoint >> 12) | 0xe0
      buf[pos++] = ((codePoint >> 6) & 0x3f) | 0x80
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 3
    } else {
      // a surrogate pair decodes to at most 0x10ffff, so four bytes always suffice
      if (remaining < 4) break
      buf[pos++] = (codePoint >> 18) | 0xf0
      buf[pos++] = ((codePoint >> 12) & 0x3f) | 0x80
      buf[pos++] = ((codePoint >> 6) & 0x3f) | 0x80
      buf[pos++] = (codePoint & 0x3f) | 0x80
      remaining -= 4
    }
  }

  // a lead surrogate left pending at the end of the string is unpaired
  if (leadSurrogate && remaining >= 3) pos = writeInvalid(buf, pos)

  return pos - offset
}
953
+
954
/**
 * Computes how many bytes `string` occupies when encoded as UTF-8, without
 * materialising the bytes.
 *
 * Each unpaired UTF-16 surrogate is counted as the three bytes of U+FFFD.
 *
 * @param {string} string - Text to measure.
 * @returns {number} Length of the UTF-8 encoding in bytes.
 */
function utf8ByteLength (string) {
  let leadPending = false
  let size = 0

  for (let i = 0; i < string.length; i++) {
    const unit = string.charCodeAt(i)

    // is surrogate component
    if (unit > 0xd7ff && unit < 0xe000) {
      if (unit < 0xdc00) {
        // lead surrogate: a lead still pending from the previous unit is unpaired
        if (leadPending) size += 3
        leadPending = true
        continue
      }

      if (leadPending) {
        // valid surrogate pair: every supplementary code point takes four bytes,
        // so the code point itself does not need to be reconstructed
        size += 4
        leadPending = false
      } else {
        // unexpected trail
        size += 3
      }
      continue
    }

    if (leadPending) {
      // valid bmp char, but last char was a lead
      size += 3
      leadPending = false
    }

    if (unit < 0x80) {
      size += 1
    } else if (unit < 0x800) {
      size += 2
    } else {
      size += 3
    }
  }

  // a lead surrogate left pending at the end of the string is unpaired
  if (leadPending) size += 3

  return size
}
875
1009
876
1010
function asciiWrite ( buf , string , offset , length ) {
@@ -1990,90 +2124,18 @@ function base64clean (str) {
1990
2124
return str
1991
2125
}
1992
2126
1993
/**
 * Encodes `string` as UTF-8 and returns the bytes as a plain array.
 *
 * Unpaired UTF-16 surrogates become U+FFFD (EF BF BD). Encoding stops at the
 * first regular character whose bytes would exceed the `units` budget.
 *
 * @param {string} string - Text to encode.
 * @param {number} [units] - Maximum number of bytes to produce. Omitted means
 *   no limit; 0 means no bytes at all.
 * @returns {number[]} The encoded bytes.
 */
function utf8ToBytes (string, units) {
  // A plain `units || Infinity` would also turn an explicit budget of 0 into
  // "unlimited"; every other falsy value keeps meaning "no limit".
  let budget = units === 0 ? 0 : units || Infinity
  const length = string.length
  let leadSurrogate = 0
  const bytes = []

  for (let i = 0; i < length; ++i) {
    let codePoint = string.charCodeAt(i)

    // is surrogate component
    if (codePoint > 0xD7FF && codePoint < 0xE000) {
      if (codePoint < 0xDC00) {
        // lead surrogate: a lead still pending from the previous unit is unpaired
        if (leadSurrogate && (budget -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)
        leadSurrogate = codePoint
        continue
      }

      if (!leadSurrogate) {
        // unexpected trail
        if ((budget -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)
        continue
      }

      // valid surrogate pair
      codePoint = (((leadSurrogate - 0xD800) << 10) | (codePoint - 0xDC00)) + 0x10000
    } else if (leadSurrogate) {
      // valid bmp char, but last char was a lead
      if ((budget -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)
    }

    leadSurrogate = 0

    // encode utf8
    if (codePoint < 0x80) {
      if ((budget -= 1) < 0) break
      bytes.push(codePoint)
    } else if (codePoint < 0x800) {
      if ((budget -= 2) < 0) break
      bytes.push(
        (codePoint >> 0x6) | 0xC0,
        (codePoint & 0x3F) | 0x80
      )
    } else if (codePoint < 0x10000) {
      if ((budget -= 3) < 0) break
      bytes.push(
        (codePoint >> 0xC) | 0xE0,
        ((codePoint >> 0x6) & 0x3F) | 0x80,
        (codePoint & 0x3F) | 0x80
      )
    } else {
      // a surrogate pair decodes to at most 0x10FFFF, so four bytes always suffice
      if ((budget -= 4) < 0) break
      bytes.push(
        (codePoint >> 0x12) | 0xF0,
        ((codePoint >> 0xC) & 0x3F) | 0x80,
        ((codePoint >> 0x6) & 0x3F) | 0x80,
        (codePoint & 0x3F) | 0x80
      )
    }
  }

  // a lead surrogate left pending at the end of the string is unpaired
  if (leadSurrogate && (budget -= 3) > -1) bytes.push(0xEF, 0xBF, 0xBD)

  return bytes
}
2072
-
2073
2127
// Runs `str` through base64clean, then hands the cleaned text to
// base64.toByteArray and returns whatever that call yields.
function base64ToBytes (str) {
  const cleaned = base64clean(str)
  return base64.toByteArray(cleaned)
}
2076
2130
2131
/**
 * Writes the UTF-8 encoding of U+FFFD (Replacement Character) into `buf`.
 *
 * No bounds check is performed here.
 *
 * @param {Uint8Array} buf - Destination buffer.
 * @param {number} pos - Index of the first byte to write.
 * @returns {number} The index just past the three bytes written.
 */
function writeInvalid (buf, pos) {
  buf[pos] = 0xef
  buf[pos + 1] = 0xbf
  buf[pos + 2] = 0xbd
  return pos + 3
}
2138
+
2077
2139
function blitBuffer ( src , dst , offset , length ) {
2078
2140
let i
2079
2141
for ( i = 0 ; i < length ; ++ i ) {
0 commit comments