ncodeunits(s::AbstractString) gives number of code units

StefanKarpinski · StefanKarpinski · commit 9fe6082d9a11 · 2017-11-22T14:12:22.000Z
diff --git a/NEWS.md b/NEWS.md
@@ -388,6 +388,10 @@ Library improvements
     This supersedes the old behavior of reinterpret on Arrays. As a result, reinterpreting
     arrays with different alignment requirements (removed in 0.6) is once again allowed ([#23750]).
 
+  * New function `ncodeunits(s::AbstractString)` gives the number of code units in a string.
+    The generic definition is constant time but calls `endof(s)` which may be inefficient.
+    Therefore custom string types may want to define direct `ncodeunits` methods.
+
 Compiler/Runtime improvements
 -----------------------------
 
diff --git a/base/exports.jl b/base/exports.jl
@@ -757,6 +757,7 @@ export
     lstrip,
     match,
     matchall,
+    ncodeunits,
     ndigits,
     nextind,
     normalize_string,
diff --git a/base/strings/basic.jl b/base/strings/basic.jl
@@ -69,6 +69,10 @@ julia> 'j' * "ulia"
 
 one(::Union{T,Type{T}}) where {T<:AbstractString} = convert(T, "")
 
+# generic number of code units; implementations generally know how long a string
+# is though and should override this with a more efficient method
+ncodeunits(s::AbstractString) = nextind(s, endof(s)) - 1
+
 """
     length(s::AbstractString)
 
diff --git a/base/strings/string.jl b/base/strings/string.jl
@@ -87,6 +87,20 @@ codeunit(s::AbstractString, i::Integer)
     @gc_preserve s unsafe_load(pointer(s, i))
 end
 
+"""
+    ncodeunits(s::AbstractString)
+
+The number of code units in a string. For example, for UTF-8-like data such as
+the default `String` type, the number of code units is the number of bytes in
+the string, aka `sizeof(s)`. For a UTF-16 encoded string type, however, the
+code unit is `UInt16` so the number of code units is the number of `UInt16`
+words in the representation of the string. The expression `codeunit(s, i)` is
+valid and safe for precisely the range of `i` values `1:ncodeunits(s)`.
+
+See also: [`codeunit`](@ref).
+"""
+ncodeunits(s::String) = sizeof(s)
+
 write(io::IO, s::String) =
     @gc_preserve s unsafe_write(io, pointer(s), reinterpret(UInt, sizeof(s)))
 
diff --git a/test/strings/basic.jl b/test/strings/basic.jl
@@ -697,3 +697,11 @@ end
     @test String(take!(b)) == "UnicodeError: invalid character index 2 (0xba is a continuation byte)"
 end
 
+@testset "ncodeunits" begin
+    for (s, n) in [""     => 0, "a"   => 1, "abc"  => 3,
+                   "α"    => 2, "abγ" => 4, "∀"    => 3,
+                   "∀x∃y" => 8, "🍕"  => 4, "🍕∀" => 7]
+        @test ncodeunits(s) == n
+        @test ncodeunits(GenericString(s)) == n
+    end
+end