Skip to content

Commit 9a223c8

Browse files
Merge pull request #16590 from JuliaLang/sk/highlander5
remove UTF-16 and UTF-32 stuff
2 parents 752e63e + 3098e6c commit 9a223c8

30 files changed

+91
-1628
lines changed

base/deprecated.jl

-10
Original file line numberDiff line numberDiff line change
@@ -488,16 +488,6 @@ end
488488
end
489489
)
490490

491-
if sizeof(Cwchar_t) == 2
492-
@deprecate_binding WString UTF16String
493-
@deprecate_binding wstring utf16
494-
utf16(s::Cwstring) = utf16(convert(Ptr{Cwchar_t}, s))
495-
elseif sizeof(Cwchar_t) == 4
496-
@deprecate_binding WString UTF32String
497-
@deprecate_binding wstring utf32
498-
utf32(s::Cwstring) = utf32(convert(Ptr{Cwchar_t}, s))
499-
end
500-
501491
@deprecate ==(x::Char, y::Integer) UInt32(x) == y
502492
@deprecate ==(x::Integer, y::Char) x == UInt32(y)
503493
@deprecate isless(x::Char, y::Integer) UInt32(x) < y

base/docs/helpdb/Base.jl

+5-59
Original file line numberDiff line numberDiff line change
@@ -95,32 +95,6 @@ Get the step size of a [`Range`](:obj:`Range`) object.
9595
"""
9696
step
9797

98-
"""
99-
utf32(s)
100-
101-
Create a UTF-32 string from a byte array, array of `Char` or `UInt32`, or any other string
102-
type. (Conversions of byte arrays check for a byte-order marker in the first four bytes, and
103-
do not include it in the resulting string.)
104-
105-
Note that the resulting `UTF32String` data is terminated by the NUL codepoint (32-bit zero),
106-
which is not treated as a character in the string (so that it is mostly invisible in Julia);
107-
this allows the string to be passed directly to external functions requiring NUL-terminated
108-
data. This NUL is appended automatically by the `utf32(s)` conversion function. If you have
109-
a `Char` or `UInt32` array `A` that is already NUL-terminated UTF-32 data, then you can
110-
instead use `UTF32String(A)` to construct the string without making a copy of the data and
111-
treating the NUL as a terminator rather than as part of the string.
112-
"""
113-
utf32(s)
114-
115-
"""
116-
utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}} [, length])
117-
118-
Create a string from the address of a NUL-terminated UTF-32 string. A copy is made; the
119-
pointer can be safely freed. If `length` is specified, the string does not have to be
120-
NUL-terminated.
121-
"""
122-
utf32(::Union{Ptr{Char},Ptr{UInt32},Ptr{Int32}}, length=?)
123-
12498
"""
12599
takebuf_array(b::IOBuffer)
126100
@@ -3610,32 +3584,6 @@ Compute ``\\sin(\\pi x) / (\\pi x)`` if ``x \\neq 0``, and ``1`` if ``x = 0``.
36103584
"""
36113585
sinc
36123586

3613-
"""
3614-
utf16(s)
3615-
3616-
Create a UTF-16 string from a byte array, array of `UInt16`, or any other string type. (Data
3617-
must be valid UTF-16. Conversions of byte arrays check for a byte-order marker in the first
3618-
two bytes, and do not include it in the resulting string.)
3619-
3620-
Note that the resulting `UTF16String` data is terminated by the NUL codepoint (16-bit zero),
3621-
which is not treated as a character in the string (so that it is mostly invisible in Julia);
3622-
this allows the string to be passed directly to external functions requiring NUL-terminated
3623-
data. This NUL is appended automatically by the `utf16(s)` conversion function. If you have
3624-
a `UInt16` array `A` that is already NUL-terminated valid UTF-16 data, then you can instead
3625-
use `UTF16String(A)` to construct the string without making a copy of the data and treating
3626-
the NUL as a terminator rather than as part of the string.
3627-
"""
3628-
utf16(s)
3629-
3630-
"""
3631-
utf16(::Union{Ptr{UInt16},Ptr{Int16}} [, length])
3632-
3633-
Create a string from the address of a NUL-terminated UTF-16 string. A copy is made; the
3634-
pointer can be safely freed. If `length` is specified, the string does not have to be
3635-
NUL-terminated.
3636-
"""
3637-
utf16(::Union{Ptr{UInt16},Ptr{Int16}}, length=?)
3638-
36393587
"""
36403588
median(v[, region])
36413589
@@ -8859,19 +8807,17 @@ vecnorm
88598807
"""
88608808
isvalid(value) -> Bool
88618809
8862-
Returns `true` if the given value is valid for its type, which currently can be one of
8863-
`Char`, `String`, `UTF16String`, or `UTF32String`.
8810+
Returns `true` if the given value is valid for its type, which currently can be either
8811+
`Char` or `String`.
88648812
"""
88658813
isvalid(value)
88668814

88678815
"""
88688816
isvalid(T, value) -> Bool
88698817
8870-
Returns `true` if the given value is valid for that type. Types currently can be `Char`,
8871-
`String`, `UTF16String`, or `UTF32String` Values for `Char` can be of
8872-
type `Char` or `UInt32` Values for `String` can be of that type, or
8873-
`Vector{UInt8}` Values for `UTF16String` can be `UTF16String` or `Vector{UInt16}` Values for
8874-
`UTF32String` can be `UTF32String`, `Vector{Char}` or `Vector{UInt32}`
8818+
Returns `true` if the given value is valid for that type. Types currently can
8819+
be either `Char` or `String`. Values for `Char` can be of type `Char` or `UInt32`.
8820+
Values for `String` can be of that type, or `Vector{UInt8}`.
88758821
"""
88768822
isvalid(T,value)
88778823

base/exports.jl

-4
Original file line numberDiff line numberDiff line change
@@ -120,8 +120,6 @@ export
120120
Tridiagonal,
121121
UnitRange,
122122
UpperTriangular,
123-
UTF16String,
124-
UTF32String,
125123
Val,
126124
VecOrMat,
127125
Vector,
@@ -879,8 +877,6 @@ export
879877
ucfirst,
880878
unescape_string,
881879
uppercase,
882-
utf16,
883-
utf32,
884880
warn,
885881

886882
# random numbers

base/replutil.jl

+11-1
Original file line numberDiff line numberDiff line change
@@ -233,14 +233,24 @@ end
233233
showerror(io::IO, ::DivideError) = print(io, "DivideError: integer division error")
234234
showerror(io::IO, ::StackOverflowError) = print(io, "StackOverflowError:")
235235
showerror(io::IO, ::UndefRefError) = print(io, "UndefRefError: access to undefined reference")
236-
showerror(io::IO, ex::UndefVarError) = print(io, "UndefVarError: $(ex.var) not defined")
237236
showerror(io::IO, ::EOFError) = print(io, "EOFError: read end of file")
238237
showerror(io::IO, ex::ErrorException) = print(io, ex.msg)
239238
showerror(io::IO, ex::KeyError) = print(io, "KeyError: key $(repr(ex.key)) not found")
240239
showerror(io::IO, ex::InterruptException) = print(io, "InterruptException:")
241240
showerror(io::IO, ex::ArgumentError) = print(io, "ArgumentError: $(ex.msg)")
242241
showerror(io::IO, ex::AssertionError) = print(io, "AssertionError: $(ex.msg)")
243242

243+
# Print an `UndefVarError`. Names that used to live in Base but were moved to
# the LegacyStrings package get a migration hint (rendered through the
# `ErrorException` method of `showerror`) instead of the generic message.
function showerror(io::IO, ex::UndefVarError)
    relocated = (:UTF16String, :UTF32String, :WString, :utf16, :utf32, :wstring)
    # Common case first: an ordinary undefined variable.
    if !(ex.var in relocated)
        return print(io, "UndefVarError: $(ex.var) not defined")
    end
    hint = ErrorException("""
        `$(ex.var)` has been moved to the package LegacyStrings.jl:
        Run Pkg.add("LegacyStrings") to install LegacyStrings on Julia v0.5-;
        Then do `using LegacyStrings` to get `$(ex.var)`.
        """)
    return showerror(io, hint)
end
253+
244254
function showerror(io::IO, ex::MethodError)
245255
# ex.args is a tuple type if it was thrown from `invoke` and is
246256
# a tuple of the arguments otherwise.

base/serialize.jl

+2-3
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@ const TAGS = Any[
2121
Symbol, Tuple, Expr, # dummy entries, intentionally shadowed by earlier ones
2222
LineNumberNode, Slot, LabelNode, GotoNode,
2323
QuoteNode, :reserved23 #=was TopNode=#, TypeVar, Core.Box, LambdaInfo,
24-
Module, #=UndefRefTag=#Symbol, Task, String,
25-
UTF16String, UTF32String, Float16,
24+
Module, #=UndefRefTag=#Symbol, Task, String, Float16,
2625
SimpleVector, #=BackrefTag=#Symbol, Method, GlobalRef,
2726

2827
(), Bool, Any, :Any, Bottom, :reserved21, :reserved22, Type,
@@ -42,7 +41,7 @@ const TAGS = Any[
4241
28, 29, 30, 31, 32
4342
]
4443

45-
const ser_version = 3 # do not make changes without bumping the version #!
44+
const ser_version = 4 # do not make changes without bumping the version #!
4645

4746
const NTAGS = length(TAGS)
4847

base/strings/errors.jl

-16
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,7 @@
33
## Error messages for Unicode / UTF support
44

55
const UTF_ERR_SHORT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)"
6-
const UTF_ERR_CONT = "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)"
7-
const UTF_ERR_LONG = "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)"
8-
const UTF_ERR_NOT_LEAD = "not a leading Unicode surrogate code unit at index <<1>> (0x<<2>>)"
9-
const UTF_ERR_NOT_TRAIL = "not a trailing Unicode surrogate code unit at index <<1>> (0x<<2>>)"
10-
const UTF_ERR_NOT_SURROGATE = "not a valid Unicode surrogate code unit at index <<1>> (0x<<2>>)"
11-
const UTF_ERR_MISSING_SURROGATE = "missing trailing Unicode surrogate code unit after index <<1>> (0x<<2>>)"
12-
const UTF_ERR_INVALID = "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)"
13-
const UTF_ERR_SURROGATE = "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)"
14-
const UTF_ERR_NULL_16_TERMINATE = "UTF16String data must be NULL-terminated"
15-
const UTF_ERR_NULL_32_TERMINATE = "UTF32String data must be NULL-terminated"
16-
const UTF_ERR_ODD_BYTES_16 = "UTF16String can't have odd number of bytes <<1>>"
17-
const UTF_ERR_ODD_BYTES_32 = "UTF32String must have multiple of 4 bytes <<1>>"
18-
const UTF_ERR_INVALID_CHAR = "invalid Unicode character (0x<<2>> > 0x10ffff)"
19-
const UTF_ERR_INVALID_8 = "invalid UTF-8 data"
20-
const UTF_ERR_INVALID_16 = "invalid UTF-16 data"
216
const UTF_ERR_INVALID_INDEX = "invalid character index"
22-
const UTF_ERR_MAP_CHAR = "map(f,s::AbstractString) requires f to return Char; try map(f,collect(s)) or a comprehension instead"
237

248
type UnicodeError <: Exception
259
errmsg::AbstractString ##< A UTF_ERR_ message

base/strings/io.jl

+19
Original file line numberDiff line numberDiff line change
@@ -324,3 +324,22 @@ function unindent(str::AbstractString, indent::Int; tabwidth=8)
324324
end
325325
takebuf_string(buf)
326326
end
327+
328+
# Build a `String` from a vector of `Char`s, combining UTF-16 surrogate pairs.
#
# A lead surrogate (U+D800-U+DBFF) immediately followed by a trail surrogate
# (U+DC00-U+DFFF) is merged into the single supplementary code point the pair
# encodes. Every other element, including an unpaired surrogate, is written
# through unchanged with `write(io, c)`.
#
# Arguments:
# * `chars` vector of characters to encode; may be empty
#
# Returns:
# * `String` containing everything written for `chars`
function convert(::Type{String}, chars::AbstractVector{Char})
    return sprint(length(chars), io->begin
        state = start(chars)
        while !done(chars, state)
            c, state = next(chars, state)
            # Surrogate tests compare integer code points, so no `Char`
            # arithmetic can step outside the representable range.
            # Loop rather than branch: after an unpaired lead, the next element
            # may itself be a lead that pairs with the element after it.
            # The `done` check keeps a lead at the very end of the input from
            # reading past the last element; it is emitted as-is below.
            while 0xd800 <= UInt32(c) <= 0xdbff && !done(chars, state)
                d, state = next(chars, state)
                if 0xdc00 <= UInt32(d) <= 0xdfff
                    # Valid pair: 10 payload bits from each half, offset 0x10000.
                    c = Char(0x10000 + ((UInt32(c) & 0x03ff) << 10) + (UInt32(d) & 0x03ff))
                else
                    # Unpaired lead: emit it and continue with `d`.
                    write(io, c)
                    c = d
                end
            end
            write(io, c)
        end
    end)
end

base/strings/string.jl

+4-103
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ const utf8_trailing = [
3333

3434
## required core functionality ##
3535

36+
# Return `true` when the bits of `c` selected by the mask 0xc0 equal 0x80,
# i.e. `c` has the `10xxxxxx` shape of a UTF-8 continuation byte.
function is_valid_continuation(c)
    masked = c & 0xc0
    return masked == 0x80
end
37+
3638
function endof(s::String)
3739
d = s.data
3840
i = length(d)
@@ -102,7 +104,7 @@ function first_utf8_byte(ch::Char)
102104
end
103105

104106
function reverseind(s::String, i::Integer)
105-
j = lastidx(s) + 1 - i
107+
j = length(s.data) + 1 - i
106108
d = s.data
107109
while is_valid_continuation(d[j])
108110
j -= 1
@@ -114,8 +116,6 @@ end
114116

115117
sizeof(s::String) = sizeof(s.data)
116118

117-
lastidx(s::String) = length(s.data)
118-
119119
isvalid(s::String, i::Integer) =
120120
(1 <= i <= endof(s.data)) && !is_valid_continuation(s.data[i])
121121

@@ -239,109 +239,10 @@ function reverse(s::String)
239239
String(buf)
240240
end
241241

242-
## outputting UTF-8 strings ##
243-
244242
write(io::IO, s::String) = write(io, s.data)
245243

246244
pointer(x::String) = pointer(x.data)
247245
pointer(x::String, i::Integer) = pointer(x.data)+(i-1)
248246

249-
## transcoding to UTF-8 ##
250-
251247
convert(::Type{String}, s::String) = s
252-
253-
function convert(::Type{String}, dat::Vector{UInt8})
254-
# handle zero length string quickly
255-
isempty(dat) && return empty_utf8
256-
# get number of bytes to allocate
257-
len, flags, num4byte, num3byte, num2byte = unsafe_checkstring(dat)
258-
if (flags & (UTF_LONG | UTF_SURROGATE)) == 0
259-
len = sizeof(dat)
260-
@inbounds return String(copy!(Vector{UInt8}(len), 1, dat, 1, len))
261-
end
262-
# Copy, but eliminate over-long encodings and surrogate pairs
263-
len += num2byte + num3byte*2 + num4byte*3
264-
buf = Vector{UInt8}(len)
265-
out = 0
266-
pos = 0
267-
@inbounds while out < len
268-
ch::UInt32 = dat[pos += 1]
269-
# Handle ASCII characters
270-
if ch <= 0x7f
271-
buf[out += 1] = ch
272-
# Handle overlong < 0x100
273-
elseif ch < 0xc2
274-
buf[out += 1] = ((ch & 3) << 6) | (dat[pos += 1] & 0x3f)
275-
# Handle 0x100-0x7ff
276-
elseif ch < 0xe0
277-
buf[out += 1] = ch
278-
buf[out += 1] = dat[pos += 1]
279-
elseif ch != 0xed
280-
buf[out += 1] = ch
281-
buf[out += 1] = dat[pos += 1]
282-
buf[out += 1] = dat[pos += 1]
283-
# Copy 4-byte encoded value
284-
ch >= 0xf0 && (buf[out += 1] = dat[pos += 1])
285-
# Handle surrogate pairs
286-
else
287-
ch = dat[pos += 1]
288-
if ch < 0xa0 # not surrogate pairs
289-
buf[out += 1] = 0xed
290-
buf[out += 1] = ch
291-
buf[out += 1] = dat[pos += 1]
292-
else
293-
# Pick up surrogate pairs (CESU-8 format)
294-
ch = ((((((ch & 0x3f) << 6) | (dat[pos + 1] & 0x3f)) << 10)
295-
+ (((dat[pos + 3] & 0x3f)%UInt32 << 6) | (dat[pos + 4] & 0x3f)))
296-
- 0x01f0c00)
297-
pos += 4
298-
output_utf8_4byte!(buf, out, ch)
299-
out += 4
300-
end
301-
end
302-
end
303-
String(buf)
304-
end
305-
306-
"""
307-
Converts an already validated vector of `UInt16` or `UInt32` to a `String`
308-
309-
Input Arguments:
310-
311-
* `dat` Vector of code units (`UInt16` or `UInt32`), explicit `\0` is not converted
312-
* `len` length of output in bytes
313-
314-
Returns:
315-
316-
* `String`
317-
"""
318-
function encode_to_utf8{T<:Union{UInt16, UInt32}}(::Type{T}, dat, len)
319-
buf = Vector{UInt8}(len)
320-
out = 0
321-
pos = 0
322-
@inbounds while out < len
323-
ch::UInt32 = dat[pos += 1]
324-
# Handle ASCII characters
325-
if ch <= 0x7f
326-
buf[out += 1] = ch
327-
# Handle 0x80-0x7ff
328-
elseif ch < 0x800
329-
buf[out += 1] = 0xc0 | (ch >>> 6)
330-
buf[out += 1] = 0x80 | (ch & 0x3f)
331-
# Handle 0x10000-0x10ffff (if input is UInt32)
332-
elseif ch > 0xffff # this is only for T == UInt32, should not be generated for UInt16
333-
output_utf8_4byte!(buf, out, ch)
334-
out += 4
335-
# Handle surrogate pairs
336-
elseif is_surrogate_codeunit(ch)
337-
output_utf8_4byte!(buf, out, get_supplementary(ch, dat[pos += 1]))
338-
out += 4
339-
# Handle 0x800-0xd7ff, 0xe000-0xffff UCS-2 characters
340-
else
341-
buf[out += 1] = 0xe0 | ((ch >>> 12) & 0x3f)
342-
buf[out += 1] = 0x80 | ((ch >>> 6) & 0x3f)
343-
buf[out += 1] = 0x80 | (ch & 0x3f)
344-
end
345-
end
346-
String(buf)
347-
end
248+
# Convert a byte vector to a `String` by delegating to the `String`
# constructor; no transcoding is done in this method.
function convert(::Type{String}, v::Vector{UInt8})
    return String(v)
end

base/strings/types.jl

+3-4
Original file line numberDiff line numberDiff line change
@@ -118,12 +118,11 @@ reverse(s::RevString) = s.string
118118

119119
## reverse an index i so that reverse(s)[i] == s[reverseind(s,i)]
120120

121+
# Generic fallback: map index `i` of `reverse(s)` to the matching index of `s`
# by working in character positions instead of code-unit indices.
function reverseind(s::AbstractString, i)
    nchars = length(s)
    # Character position of index `i` within the reversed string.
    revpos = ind2chr(reverse(s), i)
    # Mirror the position and translate it back into an index of `s`.
    return chr2ind(s, nchars + 1 - revpos)
end
121122
reverseind(s::Union{DirectIndexString,SubString{DirectIndexString}}, i::Integer) = length(s) + 1 - i
122123
reverseind(s::RevString, i::Integer) = endof(s) - i + 1
123-
lastidx(s::AbstractString) = nextind(s, endof(s)) - 1
124-
lastidx(s::DirectIndexString) = length(s)
125-
reverseind(s::SubString, i::Integer) =
126-
reverseind(s.string, lastidx(s.string)-s.offset-s.endof+i) - s.offset
124+
# Reverse an index of a `SubString{String}` by translating it into the parent
# string, reversing it there, and shifting the result back by the offset.
function reverseind(s::SubString{String}, i::Integer)
    parent = s.string
    # Index just before the one that follows the parent's last character.
    parent_last = nextind(parent, endof(parent)) - 1
    parent_index = parent_last - s.offset - s.endof + i
    return reverseind(parent, parent_index) - s.offset
end
127126

128127
## efficient representation of repeated strings ##
129128

base/sysimg.jl

-1
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ include("iobuffer.jl")
145145
include("char.jl")
146146
include("intfuncs.jl")
147147
include("strings/strings.jl")
148-
include("unicode/unicode.jl")
149148
include("parse.jl")
150149
include("shell.jl")
151150
include("regex.jl")

0 commit comments

Comments
 (0)