From 3b2448aa3532bbf61adc0c0d59227536c455b5ad Mon Sep 17 00:00:00 2001
From: Daniel Gray <the.daniel.gray@gmail.com>
Date: Sun, 22 Aug 2021 00:57:28 +0200
Subject: [PATCH] Speed up the branchless UTF-8 decoder by removing !len
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In your post, you say: "Adding that !len is actually somewhat costly,
though I couldn’t figure out why."

My suspicion is that the "!" operator essentially behaves like a
branch: it has to produce 1 if the input is 0 and 0 otherwise.

So my idea was to add a second lookup table alongside your table of
lengths, holding the "error extra length" for each lead-byte class: 0
when the lead byte is valid and 1 when it is not. Adding it to len gives
the same effect as !len, so the decoder still moves forward at least one
byte on an error, as mentioned in your post.

The throughput went up from 504 MB/s to 557 MB/s on my machine.
---
 utf8.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/utf8.h b/utf8.h
index 8c6a7a0..bb40c41 100644
--- a/utf8.h
+++ b/utf8.h
@@ -29,6 +29,10 @@ utf8_decode(void *buf, uint32_t *c, int *e)
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0
     };
+    static const char err_extras[] = {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1
+    };
     static const int masks[]  = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
     static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
     static const int shiftc[] = {0, 18, 12, 6, 0};
@@ -36,12 +40,13 @@ utf8_decode(void *buf, uint32_t *c, int *e)
 
     unsigned char *s = buf;
     int len = lengths[s[0] >> 3];
+    int err_extra_len = err_extras[s[0] >> 3];
 
     /* Compute the pointer to the next character early so that the next
      * iteration can start working on the next character. Neither Clang
      * nor GCC figure out this reordering on their own.
      */
-    unsigned char *next = s + len + !len;
+    unsigned char *next = s + len + err_extra_len;
 
     /* Assume a four-byte character and load four bytes. Unused bits are
      * shifted out.