@@ -33,6 +33,8 @@ const utf8_trailing = [
33
33
34
34
## required core functionality ##
35
35
36
"""
    is_valid_continuation(c)

Return `true` when the two highest bits of the byte `c` are `10`
(that is, `c & 0xc0 == 0x80`), the bit pattern of a UTF-8 continuation byte.
"""
function is_valid_continuation(c)
    top_bits = c & 0xc0
    return top_bits == 0x80
end
37
+
36
38
function endof (s:: String )
37
39
d = s. data
38
40
i = length (d)
@@ -102,7 +104,7 @@ function first_utf8_byte(ch::Char)
102
104
end
103
105
104
106
function reverseind (s:: String , i:: Integer )
105
- j = lastidx (s ) + 1 - i
107
+ j = length (s . data ) + 1 - i
106
108
d = s. data
107
109
while is_valid_continuation (d[j])
108
110
j -= 1
114
116
115
117
"""
    sizeof(s::String)

Return `sizeof` of the string's backing `data` field.
"""
function sizeof(s::String)
    return sizeof(s.data)
end
116
118
117
# Removed helper: returned `length(s.data)`, the number of elements in the
# string's backing `data` vector.
- lastidx (s:: String ) = length (s. data)
118
-
119
119
"""
    isvalid(s::String, i::Integer)

Return `true` if `i` lies within `1:endof(s.data)` and the byte `s.data[i]` is not a
continuation byte according to `is_valid_continuation`; return `false` otherwise.
The byte is only read when `i` is in range.
"""
function isvalid(s::String, i::Integer)
    bytes = s.data
    in_range = 1 <= i <= endof(bytes)
    return in_range && !is_valid_continuation(bytes[i])
end
121
121
@@ -239,109 +239,10 @@ function reverse(s::String)
239
239
String (buf)
240
240
end
241
241
242
- # # outputting UTF-8 strings ##
243
-
244
242
"""
    write(io::IO, s::String)

Write the string's backing `data` to `io` and return the result of that `write` call.
"""
function write(io::IO, s::String)
    return write(io, s.data)
end
245
243
246
244
"""
    pointer(x::String)

Return the pointer to the string's backing `data`.
"""
function pointer(x::String)
    return pointer(x.data)
end
247
245
"""
    pointer(x::String, i::Integer)

Return the pointer to the string's backing `data`, advanced by `i - 1`.
No bounds check is performed on `i`.
"""
function pointer(x::String, i::Integer)
    offset = i - 1
    return pointer(x.data) + offset
end
248
246
249
- # # transcoding to UTF-8 ##
250
-
251
247
"""
    convert(::Type{String}, s::String)

Identity conversion: return `s` itself, without copying.
"""
function convert(::Type{String}, s::String)
    return s
end
252
-
253
# Removed method: build a `String` from a byte vector, re-encoding over-long
# sequences and surrogate pairs when `unsafe_checkstring` flags them.
#   * empty input returns `empty_utf8`
#   * when neither `UTF_LONG` nor `UTF_SURROGATE` is set in `flags`, the result
#     wraps a plain copy of `dat`
#   * otherwise the bytes are rewritten one sequence at a time into a new buffer
# NOTE(review): `unsafe_checkstring`, `empty_utf8`, `output_utf8_4byte!` and the
# `UTF_*` flags are defined outside this view; their exact contracts are assumed.
- function convert (:: Type{String} , dat:: Vector{UInt8} )
254
- # handle zero length string quickly
255
- isempty (dat) && return empty_utf8
256
- # get number of bytes to allocate
257
- len, flags, num4byte, num3byte, num2byte = unsafe_checkstring (dat)
258
- if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
259
# Nothing to normalize: the output has exactly as many bytes as the input.
- len = sizeof (dat)
260
- @inbounds return String (copy! (Vector {UInt8} (len), 1 , dat, 1 , len))
261
- end
262
- # Copy, but eliminate over-long encodings and surrogate pairs
263
# Output size in bytes: `len` plus 1/2/3 extra bytes per 2/3/4-byte sequence
# (presumably `len` is a character count at this point — confirm against
# `unsafe_checkstring`).
- len += num2byte + num3byte* 2 + num4byte* 3
264
- buf = Vector {UInt8} (len)
265
# `out` and `pos` are the last-written / last-read indices; both are
# pre-incremented on every access.
- out = 0
266
- pos = 0
267
- @inbounds while out < len
268
- ch:: UInt32 = dat[pos += 1 ]
269
- # Handle ASCII characters
270
- if ch <= 0x7f
271
- buf[out += 1 ] = ch
272
- # Handle overlong < 0x100
273
# Lead byte below 0xc2: fold the two input bytes into a single output byte.
- elseif ch < 0xc2
274
- buf[out += 1 ] = ((ch & 3 ) << 6 ) | (dat[pos += 1 ] & 0x3f )
275
- # Handle 0x100-0x7ff
276
# Lead byte 0xc2-0xdf: two-byte sequence, copied through unchanged.
- elseif ch < 0xe0
277
- buf[out += 1 ] = ch
278
- buf[out += 1 ] = dat[pos += 1 ]
279
# Any other lead byte except 0xed: copy three bytes, plus a fourth when the
# lead byte is >= 0xf0.
- elseif ch != 0xed
280
- buf[out += 1 ] = ch
281
- buf[out += 1 ] = dat[pos += 1 ]
282
- buf[out += 1 ] = dat[pos += 1 ]
283
- # Copy 4-byte encoded value
284
- ch >= 0xf0 && (buf[out += 1 ] = dat[pos += 1 ])
285
- # Handle surrogate pairs
286
# Lead byte 0xed: the second byte decides between an ordinary three-byte
# sequence (< 0xa0) and an encoded surrogate.
- else
287
- ch = dat[pos += 1 ]
288
- if ch < 0xa0 # not surrogate pairs
289
- buf[out += 1 ] = 0xed
290
- buf[out += 1 ] = ch
291
- buf[out += 1 ] = dat[pos += 1 ]
292
- else
293
- # Pick up surrogate pairs (CESU-8 format)
294
# Combines the payload bits of dat[pos], dat[pos+1], dat[pos+3] and dat[pos+4];
# dat[pos+2] is not read (presumably the 0xed lead byte of the second
# surrogate — confirm). The constant subtraction rebases the result to the
# code point handed to `output_utf8_4byte!`.
- ch = ((((((ch & 0x3f ) << 6 ) | (dat[pos + 1 ] & 0x3f )) << 10 )
295
- + (((dat[pos + 3 ] & 0x3f )% UInt32 << 6 ) | (dat[pos + 4 ] & 0x3f )))
296
- - 0x01f0c00 )
297
- pos += 4
298
# `output_utf8_4byte!` is given `out` before the increment, so `out` is advanced
# by four here.
- output_utf8_4byte! (buf, out, ch)
299
- out += 4
300
- end
301
- end
302
- end
303
- String (buf)
304
- end
305
-
306
- """
307
- Converts an already validated vector of `UInt16` or `UInt32` to a `String`
308
-
309
- Input Arguments:
310
-
311
- * `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0 ` is not converted
312
- * `len` length of output in bytes
313
-
314
- Returns:
315
-
316
- * `String`
317
- """
318
- function encode_to_utf8 {T<:Union{UInt16, UInt32}} (:: Type{T} , dat, len)
319
# Removed function: encodes the code units in `dat` as UTF-8 into a fresh
# `len`-byte buffer and wraps it in a `String`. The loop runs until `len`
# output bytes have been written, so `len` must match the encoded size exactly;
# nothing here validates it (accesses are under `@inbounds`).
- buf = Vector {UInt8} (len)
320
# `out` and `pos` are the last-written / last-read indices; both are
# pre-incremented on every access.
- out = 0
321
- pos = 0
322
- @inbounds while out < len
323
- ch:: UInt32 = dat[pos += 1 ]
324
- # Handle ASCII characters
325
- if ch <= 0x7f
326
- buf[out += 1 ] = ch
327
- # Handle 0x80-0x7ff
328
- elseif ch < 0x800
329
- buf[out += 1 ] = 0xc0 | (ch >>> 6 )
330
- buf[out += 1 ] = 0x80 | (ch & 0x3f )
331
- # Handle 0x10000-0x10ffff (if input is UInt32)
332
- elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
333
# `output_utf8_4byte!` is given `out` before the increment, so `out` is advanced
# by four afterwards.
- output_utf8_4byte! (buf, out, ch)
334
- out += 4
335
- # Handle surrogate pairs
336
# Consumes one extra code unit from `dat` and combines the pair through
# `get_supplementary` (defined outside this view) into one 4-byte sequence.
- elseif is_surrogate_codeunit (ch)
337
- output_utf8_4byte! (buf, out, get_supplementary (ch, dat[pos += 1 ]))
338
- out += 4
339
- # Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
340
- else
341
# `ch` is at most 0xffff in this branch, so `ch >>> 12` fits in four bits and
# the `& 0x3f` mask on the lead byte leaves it unchanged.
- buf[out += 1 ] = 0xe0 | ((ch >>> 12 ) & 0x3f )
342
- buf[out += 1 ] = 0x80 | ((ch >>> 6 ) & 0x3f )
343
- buf[out += 1 ] = 0x80 | (ch & 0x3f )
344
- end
345
- end
346
- String (buf)
347
- end
248
"""
    convert(::Type{String}, v::Vector{UInt8})

Convert the byte vector `v` to a `String` by passing it straight to the `String`
constructor; no validation or re-encoding is done here.
"""
function convert(::Type{String}, v::Vector{UInt8})
    return String(v)
end
0 commit comments