Unicode character support in screen tab names

Explorer09 · Explorer09 · commit 62b39cd51608 · 2025-06-05T01:06:33.000+08:00
diff --git a/Action.c b/Action.c
@@ -411,7 +411,8 @@ Htop_Reaction Action_setScreenTab(State* st, int x) {
    int rem = x - SCREEN_TAB_MARGIN_LEFT;
    for (unsigned int i = 0; i < settings->nScreens; i++) {
       const char* tab = settings->screens[i]->heading;
-      int width = rem >= bracketWidth ? (int)strnlen(tab, rem - bracketWidth + 1) : 0;
+      const char* ptr = tab;
+      int width = rem >= bracketWidth ? String_mbswidth(&ptr, SIZE_MAX, rem - bracketWidth + 1) : 0;
       if (width >= rem - bracketWidth + 1) {
          settings->ssIndex = i;
          setActiveScreen(settings, st, i);
diff --git a/ScreenManager.c b/ScreenManager.c
@@ -169,9 +169,10 @@ static inline bool drawTab(const int* y, int* x, int l, const char* name, bool c
    (*x)++;
    if (*x >= l)
       return false;
-   int nameWidth = (int)strnlen(name, l - *x);
+   const char* ptr = name;
+   int nameWidth = String_mbswidth(&ptr, (size_t)INT_MAX, l - *x);
    attrset(CRT_colors[cur ? SCREENS_CUR_TEXT : SCREENS_OTH_TEXT]);
-   mvaddnstr(*y, *x, name, nameWidth);
+   mvaddnstr(*y, *x, name, (int)(ptr - name));
    *x += nameWidth;
    if (*x >= l)
       return false;
diff --git a/XUtils.c b/XUtils.c
@@ -10,8 +10,10 @@ in the source distribution for its full text.
 #include "XUtils.h"
 
 #include <assert.h>
+#include <ctype.h> // IWYU pragma: keep
 #include <errno.h>
 #include <fcntl.h>
+#include <limits.h> // IWYU pragma: keep
 #include <math.h>
 #include <stdarg.h>
 #include <stdint.h>
@@ -235,6 +237,246 @@ size_t strnlen(const char* str, size_t maxLen) {
 }
 #endif
 
+#ifdef HAVE_LIBNCURSESW
+// Encodes "wc" at ps->pos, or only counts its bytes when ps->buf is NULL.
+static void String_encodeWChar(WCharEncoderState* ps, wchar_t wc) {
+   assert(!ps->buf || ps->pos < ps->size);
+
+   // Convert into a scratch buffer first: wcrtomb() may store several bytes,
+   // so writing straight into ps->buf could overrun it before the size check
+   // below rejects a too-small buffer. Truncated output is not supported.
+   char tempBuf[MB_LEN_MAX];
+   size_t len = wcrtomb(tempBuf, wc, &ps->mbState);
+   assert(len > 0);
+   if (len == (size_t)-1) {
+      assert(len != (size_t)-1);
+      fail();
+   }
+   if (ps->buf && len > ps->size - ps->pos) {
+      assert(!ps->buf || len <= ps->size - ps->pos);
+      fail();
+   }
+   if (ps->buf)
+      memcpy((char*)ps->buf + ps->pos, tempBuf, len);
+   ps->pos += len;
+}
+#else
+// Single-byte variant: stores "c" at ps->pos (skipped when ps->buf is NULL)
+// and advances ps->pos by one.
+static void String_encodeWChar(WCharEncoderState* ps, int c) {
+   assert(!ps->buf || ps->pos < ps->size);
+   char* buf = ps->buf;
+   if (buf) {
+      buf[ps->pos] = (char)c;
+   }
+   ps->pos += 1;
+}
+#endif
+
+// Feeds "src" (at most maxLen bytes, stopping at a NUL) to encodeWChar() one
+// character at a time, followed by a final NUL. Undecodable or unprintable
+// input is replaced; each consecutive run yields one replacement character.
+void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar) {
+   assert(src || maxLen == 0);
+   size_t pos = 0;
+   bool wasReplaced = false;
+
+#ifdef HAVE_LIBNCURSESW
+   const wchar_t replacementChar = CRT_utf8 ? L'\xFFFD' : L'?';
+   wchar_t ch;
+   mbstate_t decState;
+   memset(&decState, 0, sizeof(decState));
+#else
+   const char replacementChar = '?';
+   char ch;
+#endif
+
+   do {
+      size_t len = 0;
+      bool shouldReplace = false;
+      ch = 0;
+
+      if (pos < maxLen) {
+         // Read the next character from the byte sequence
+#ifdef HAVE_LIBNCURSESW
+         mbstate_t newState;
+         memcpy(&newState, &decState, sizeof(newState)); // decode on a copy of the state
+         len = mbrtowc(&ch, &src[pos], maxLen - pos, &newState);
+         assert(len != 0 || ch == 0);
+         switch (len) {
+         case (size_t)-2: // incomplete sequence: consume the rest of the input
+            errno = EILSEQ;
+            shouldReplace = true;
+            len = maxLen - pos;
+            break;
+
+         case (size_t)-1: // invalid sequence: skip one byte
+            shouldReplace = true;
+            len = 1;
+            break;
+
+         default: // decoded successfully: commit the copied state
+            memcpy(&decState, &newState, sizeof(decState));
+         }
+#else
+         len = 1;
+         ch = src[pos];
+#endif
+      }
+
+      pos += len;
+
+      // Filter unprintable characters
+      if (!shouldReplace && ch != 0) {
+#ifdef HAVE_LIBNCURSESW
+         shouldReplace = !iswprint(ch);
+#else
+         shouldReplace = !isprint((unsigned char)ch);
+#endif
+      }
+
+      if (shouldReplace) {
+         ch = replacementChar;
+         if (wasReplaced) { // a replacement was already emitted for this run
+            continue;
+         }
+      }
+      wasReplaced = shouldReplace;
+
+      encodeWChar(ps, ch);
+   } while (ch != 0); // the terminating NUL has been emitted by then
+}
+
+char* String_makePrintable(const char* str, size_t maxLen) {
+   // Pass 1: with no output buffer the encoder only counts the bytes needed.
+   WCharEncoderState counter;
+   memset(&counter, 0, sizeof(counter));
+   EncodePrintableString(&counter, str, maxLen, String_encodeWChar);
+   const size_t needed = counter.pos;
+   assert(needed > 0);
+
+   // Pass 2: encode again, this time into a buffer of exactly that size.
+   WCharEncoderState writer;
+   memset(&writer, 0, sizeof(writer));
+   writer.buf = xMalloc(needed);
+   writer.size = needed;
+   EncodePrintableString(&writer, str, maxLen, String_encodeWChar);
+   assert(writer.pos == needed);
+   return writer.buf;
+}
+
+// Decodes the next character of ps->str into ps->ch; false when there is none.
+bool String_decodeNextWChar(MBStringDecoderState* ps) {
+   if (!ps->str || ps->maxLen == 0) {
+      return false;
+   }
+   // If the previous call of this function encounters an invalid sequence,
+   // do not continue (because the "mbState" object for mbrtowc() is
+   // undefined). The caller is supposed to reset the state.
+#ifdef HAVE_LIBNCURSESW
+   bool isStateDefined = ps->ch != WEOF;
+#else
+   bool isStateDefined = ps->ch != EOF;
+#endif
+   if (!isStateDefined) {
+      return false;
+   }
+
+#ifdef HAVE_LIBNCURSESW
+   wchar_t wc;
+   size_t len = mbrtowc(&wc, ps->str, ps->maxLen, &ps->mbState);
+   switch (len) {
+   case (size_t)-1:
+      // Invalid sequence
+      ps->ch = WEOF;
+      return false;
+
+   case (size_t)-2:
+      // Incomplete sequence
+      ps->str += ps->maxLen;
+      ps->maxLen = 0;
+      return false;
+
+   case 0: // terminating NUL: nothing is left to decode
+      assert(wc == 0);
+      ps->str = NULL;
+      ps->maxLen = 0;
+      ps->ch = wc;
+      return true;
+
+   default:
+      ps->str += len;
+      ps->maxLen -= len;
+      ps->ch = wc;
+   }
+   return true;
+#else
+   // Go through unsigned char: a byte such as 0xFF must not compare equal to EOF.
+   ps->ch = (unsigned char)*ps->str;
+   if (ps->ch == 0) {
+      ps->str = NULL;
+      ps->maxLen = 0;
+   } else {
+      ps->str++;
+      ps->maxLen--;
+   }
+   return true;
+#endif
+}
+
+// Measures the leading part of *str that fits in maxWidth columns (negative:
+// no limit) and maxLen bytes. Returns its width and advances *str to the end
+// of that part, so the caller can derive its byte length.
+int String_mbswidth(const char** str, size_t maxLen, int maxWidth) {
+   assert(*str || maxLen == 0);
+   if (maxWidth < 0)
+      maxWidth = INT_MAX;
+
+#ifdef HAVE_LIBNCURSESW
+   MBStringDecoderState state;
+   memset(&state, 0, sizeof(state));
+   state.str = *str;
+   state.maxLen = maxLen;
+   int totalWidth = 0;
+
+   while (String_decodeNextWChar(&state)) {
+      if (state.ch == 0)
+         break;
+
+      int w = wcwidth((wchar_t)state.ch);
+      if (w < 0) { // non-printable character: stop measuring
+         assert(w >= 0);
+         break;
+      }
+
+      if (w > maxWidth - totalWidth)
+         break;
+      totalWidth += w;
+
+      // If the character takes zero columns, include the character in the
+      // substring if the working encoding is UTF-8, and ignore it otherwise.
+      // In Unicode, combining characters are always placed after the base
+      // character, but some legacy 8-bit encodings instead place combining
+      // characters before the base character.
+      if (w <= 0 && !CRT_utf8) {
+         continue;
+      }
+
+      // Advance *str past this character. On return, *str minus its initial
+      // value is the byte length of the substring that fits the width limit.
+      *str = state.str;
+   }
+
+   assert(state.ch != WEOF); // the input must not contain invalid sequences
+   return totalWidth;
+#else
+   maxLen = MINIMUM((unsigned int)maxWidth, maxLen);
+   size_t len = strnlen(*str, maxLen);
+   *str += len;
+   return (int)len;
+#endif
+}
+
 int xAsprintf(char** strp, const char* fmt, ...) {
    va_list vl;
    va_start(vl, fmt);
diff --git a/XUtils.h b/XUtils.h
@@ -23,7 +23,32 @@ in the source distribution for its full text.
 
 #include "Compat.h"
 #include "Macros.h"
+#include "ProvideCurses.h"
+
+
+typedef struct WCharEncoderState_ {
+   size_t pos; // number of bytes encoded so far
+   size_t size; // capacity of "buf" in bytes; only checked when "buf" is set
+   void* buf; // output buffer, or NULL to only count the bytes
+   mbstate_t mbState; // conversion state handed to wcrtomb()
+} WCharEncoderState;
+
+typedef struct MBStringDecoderState_ {
+   const char* str; // next byte to decode; set to NULL once the NUL was decoded
+   size_t maxLen; // number of bytes that may still be read from "str"
+#ifdef HAVE_LIBNCURSESW
+   wint_t ch; // last decoded character; WEOF after an invalid sequence
+   mbstate_t mbState; // conversion state handed to mbrtowc()
+#else
+   int ch; // last decoded byte; a value of EOF stops further decoding
+#endif
+} MBStringDecoderState;
 
+#ifdef HAVE_LIBNCURSESW
+typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, wchar_t wc); // EncodePrintableString() callback: one wide character per call
+#else
+typedef ATTR_NONNULL void (*EncodeWChar)(WCharEncoderState* ps, int c); // EncodePrintableString() callback: one byte per call
+#endif
 
 ATTR_NORETURN
 void fail(void);
@@ -106,6 +131,18 @@ size_t String_safeStrncpy(char* restrict dest, const char* restrict src, size_t
 size_t strnlen(const char* str, size_t maxLen);
 #endif
 
+ATTR_NONNULL_N(1, 4) ATTR_ACCESS2_W(1) ATTR_ACCESS3_R(2, 3) // NOTE(review): ps is also read through the callback - confirm write-only is intended
+void EncodePrintableString(WCharEncoderState* ps, const char* src, size_t maxLen, EncodeWChar encodeWChar);
+
+ATTR_RETNONNULL ATTR_MALLOC ATTR_ACCESS3_R(1, 2) // returns a newly allocated string owned by the caller
+char* String_makePrintable(const char* str, size_t maxLen);
+
+ATTR_NONNULL // returns false when no further character can be decoded
+bool String_decodeNextWChar(MBStringDecoderState* ps);
+
+ATTR_NONNULL ATTR_ACCESS2_RW(1) // advances *str past the measured part and returns its width
+int String_mbswidth(const char** str, size_t maxLen, int maxWidth);
+
 ATTR_FORMAT(printf, 2, 3) ATTR_NONNULL_N(1, 2)
 int xAsprintf(char** strp, const char* fmt, ...);