Skip to content

Commit 9fe6082

Browse files
ncodeunits(s::AbstractString) gives number of code units
1 parent f5dcbd1 commit 9fe6082

File tree

5 files changed

+31
-0
lines changed

5 files changed

+31
-0
lines changed

NEWS.md

+4
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,10 @@ Library improvements
388388
This supersedes the old behavior of reinterpret on Arrays. As a result, reinterpreting
389389
arrays with different alignment requirements (removed in 0.6) is once again allowed ([#23750]).
390390

391+
* New function `ncodeunits(s::AbstractString)` gives the number of code units in a string.
392+
The generic definition is constant time but calls `endof(s)` which may be inefficient.
393+
Therefore custom string types may want to define direct `ncodeunits` methods.
394+
391395
Compiler/Runtime improvements
392396
-----------------------------
393397

base/exports.jl

+1
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,7 @@ export
757757
lstrip,
758758
match,
759759
matchall,
760+
ncodeunits,
760761
ndigits,
761762
nextind,
762763
normalize_string,

base/strings/basic.jl

+4
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ julia> 'j' * "ulia"
6969

7070
one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "")
7171

72+
# generic number of code units; implementations generally know how long a string
73+
# is though and should override this with a more efficient method
74+
ncodeunits(s::AbstractString) = nextind(s, endof(s)) - 1
75+
7276
"""
7377
length(s::AbstractString)
7478

base/strings/string.jl

+14
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,20 @@ codeunit(s::AbstractString, i::Integer)
8787
@gc_preserve s unsafe_load(pointer(s, i))
8888
end
8989

90+
"""
91+
ncodeunits(s::AbstractString)
92+
93+
The number of code units in a string. For example, for UTF-8-like data such as
94+
the default `String` type, the number of code units is the number of bytes in
95+
the string, aka `sizeof(s)`. For a UTF-16 encoded string type, however, the
96+
code unit is `UInt16` so the number of code units is the number of `UInt16`
97+
words in the representation of the string. The expression `codeunit(s, i)` is
98+
valid and safe for precisely the range of `i` values `1:ncodeunits(s)`.
99+
100+
See also: [`codeunit`](@ref).
101+
"""
102+
ncodeunits(s::String) = sizeof(s)
103+
90104
write(io::IO, s::String) =
91105
@gc_preserve s unsafe_write(io, pointer(s), reinterpret(UInt, sizeof(s)))
92106

test/strings/basic.jl

+8
Original file line numberDiff line numberDiff line change
@@ -697,3 +697,11 @@ end
697697
@test String(take!(b)) == "UnicodeError: invalid character index 2 (0xba is a continuation byte)"
698698
end
699699

700+
@testset "ncodeunits" begin
701+
for (s, n) in ["" => 0, "a" => 1, "abc" => 3,
702+
"α" => 2, "abγ" => 4, "∀" => 3,
703+
"∀x∃y" => 8, "🍕" => 4, "🍕∀" => 7]
704+
@test ncodeunits(s) == n
705+
@test ncodeunits(GenericString(s)) == n
706+
end
707+
end

0 commit comments

Comments
 (0)