From 3b2448aa3532bbf61adc0c0d59227536c455b5ad Mon Sep 17 00:00:00 2001 From: Daniel Gray Date: Sun, 22 Aug 2021 00:57:28 +0200 Subject: [PATCH] Speed up the branchless UTF-8 decoder by removing !len MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In your post, you say: "Adding that !len is actually somewhat costly, though I couldn’t figure out why." My suspicion is that this is because the "!" operator essentially behaves like a branch, returning 1 if the input is 0 and 0 otherwise. So, my idea was to copy the table of lengths you have and create a second table of "error lengths" that gives the same effect as "!len" through a lookup (0 when the length is OK and 1 when there is an error, to ensure that the decoder moves forward at least one byte, as mentioned). The throughput went up from 504 MB/s to 557 MB/s on my machine. --- utf8.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/utf8.h b/utf8.h index 8c6a7a0..bb40c41 100644 --- a/utf8.h +++ b/utf8.h @@ -29,6 +29,10 @@ utf8_decode(void *buf, uint32_t *c, int *e) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0 }; + static const char err_extras[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1 + }; static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07}; static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536}; static const int shiftc[] = {0, 18, 12, 6, 0}; @@ -36,12 +40,13 @@ utf8_decode(void *buf, uint32_t *c, int *e) unsigned char *s = buf; int len = lengths[s[0] >> 3]; + int err_extra_len = err_extras[s[0] >> 3]; /* Compute the pointer to the next character early so that the next * iteration can start working on the next character. Neither Clang * nor GCC figure out this reordering on their own. */ - unsigned char *next = s + len + !len; + unsigned char *next = s + len + err_extra_len; /* Assume a four-byte character and load four bytes. Unused bits are * shifted out.