PathOfBuildingCommunity · Jan 28, 2025
diff --git a/‎.gitmodules
+3 b/‎.gitmodules
+3
diff --git a/‎CMakeLists.txt
+20 b/‎CMakeLists.txt
+20
diff --git a/‎engine/common.h
+27-2 b/‎engine/common.h
+27-2
diff --git a/‎engine/common/common.cpp
+219-3 b/‎engine/common/common.cpp
+219-3
diff --git a/‎engine/common/streams.cpp
+12-4 b/‎engine/common/streams.cpp
+12-4
@@ -10,3 +10,6 @@
 [submodule "dep/glm"]
 	path = dep/glm
 	url = https://github.com/g-truc/glm.git
+[submodule "libs/luautf8"]
+	path = libs/luautf8
+	url = https://github.com/starwing/luautf8.git
@@ -271,6 +271,26 @@ target_link_libraries(lcurl
 install(TARGETS lcurl RUNTIME DESTINATION ".")
 install(FILES $<TARGET_RUNTIME_DLLS:lcurl> DESTINATION ".")
 
+# luautf8 module
+#
+# Builds the single source file of the libs/luautf8 submodule as a shared
+# library target named lua-utf8, links it against LuaJIT, and installs the
+# library together with its runtime DLL dependencies into the install root.
+
+add_library(lua-utf8 SHARED libs/luautf8/lutf8lib.c)
+
+target_compile_definitions(lua-utf8
+    PRIVATE
+    LUA_BUILD_AS_DLL
+)
+
+# NOTE(review): no include directories are listed here; presumably the Lua
+# headers arrive through the LuaJIT::LuaJIT target below -- confirm.
+target_include_directories(lua-utf8
+    PRIVATE
+)
+
+target_link_libraries(lua-utf8
+    PRIVATE
+    LuaJIT::LuaJIT
+)
+
+install(TARGETS lua-utf8 RUNTIME DESTINATION ".")
+install(FILES $<TARGET_RUNTIME_DLLS:lua-utf8> DESTINATION ".")
 
 # lzip module
 
 
@@ -26,6 +26,10 @@
 #include "common/memtrak3.h"
 #endif
 
+#include <string>
+#include <string_view>
+#include <vector>
+
 // =======
 // Classes
 // =======
@@ -475,15 +479,36 @@ T clamp(T &v, T l, T u)
 // Common Functions
 // ================
 
-int		IsColorEscape(const char* str);
-void	ReadColorEscape(const char* str, col3_t out);
+int		IsColorEscape(char const* str);
+int		IsColorEscape(std::u32string_view str);
+void	ReadColorEscape(char const* str, col3_t out);
+std::u32string_view ReadColorEscape(std::u32string_view str, col3_t out);
 
 char*	_AllocString(const char* str, const char* file, int line);
 #define AllocString(s) _AllocString(s, __FILE__, __LINE__)
 char*	_AllocStringLen(size_t len, const char* file, int line);
 #define AllocStringLen(s) _AllocStringLen(s, __FILE__, __LINE__)
 void	FreeString(const char* str);
 dword	StringHash(const char* str, int mask);
+dword	StringHash(std::string_view str, int mask);
+
+// Result of decoding UTF-8 text into UTF-32 while remembering where the
+// decoded data came from in the source bytes.
+struct IndexedUTF32String {
+	// Decoded codepoints.
+	std::u32string text;
+	// Byte offsets into the source string, one recorded at the start of each
+	// decode step.
+	std::vector<size_t> sourceCodeUnitOffsets;
+};
+
+// Decodes the UTF-8 bytes of str into an IndexedUTF32String.
+IndexedUTF32String IndexUTF8ToUTF32(std::string_view str);
+
+#ifdef _WIN32
+// Codepage-to-wide conversions. Each returns a NUL-terminated buffer
+// allocated with new[], or nullptr when str is null or cannot be converted.
+// Release the result with FreeWideString.
+wchar_t* WidenANSIString(const char* str);
+wchar_t* WidenOEMString(const char* str);
+wchar_t* WidenUTF8String(const char* str);
+void FreeWideString(wchar_t* str);
+
+// Wide-to-codepage conversions. Each returns a NUL-terminated buffer
+// allocated with new[], or nullptr when str is null or cannot be converted.
+char* NarrowANSIString(const wchar_t* str);
+char* NarrowOEMString(const wchar_t* str);
+char* NarrowUTF8String(const wchar_t* str);
+#endif
 
 #ifndef _WIN32
 #define _stricmp strcasecmp
 
@@ -205,9 +205,10 @@ int IsColorEscape(const char* str)
 	}
 	if (isdigit(str[1])) {
 		return 2;
-	} else if (str[1] == 'x' || str[1] == 'X') {
+	}
+	else if (str[1] == 'x' || str[1] == 'X') {
 		for (int c = 0; c < 6; c++) {
-			if ( !isxdigit(str[c + 2]) ) {
+			if (!isxdigit(str[c + 2])) {
 				return 0;
 			}
 		}
@@ -216,23 +217,77 @@ int IsColorEscape(const char* str)
 	return 0;
 }
 
+// Reports the length in characters of the colour escape at the start of str:
+// 2 for an indexed escape such as ^7, 8 for a direct escape such as ^x123ABC,
+// and 0 when str does not begin with a recognised escape.
+int IsColorEscape(std::u32string_view str)
+{
+	// Every escape is a caret followed by at least one more character.
+	if (str.size() < 2 || str[0] != '^') {
+		return 0;
+	}
+
+	auto const between = [](char32_t ch, char32_t lo, char32_t hi) {
+		return lo <= ch && ch <= hi;
+	};
+
+	char32_t const selector = str[1];
+
+	// Indexed form. Compared by range rather than isdigit so that only the
+	// ASCII digits 0-9 qualify.
+	if (between(selector, U'0', U'9')) {
+		return 2;
+	}
+
+	// Anything else must be the direct form: x or X followed by six hex digits.
+	bool const directMarker = selector == 'x' || selector == 'X';
+	if (!directMarker || str.size() < 8) {
+		return 0;
+	}
+	for (char32_t const ch : str.substr(2, 6)) {
+		bool const hex = between(ch, U'0', U'9') || between(ch, U'A', U'F') || between(ch, U'a', U'f');
+		if (!hex) {
+			return 0;
+		}
+	}
+	return 8;
+}
+
+// Decodes the colour escape at the start of str into out.
+// out is left untouched when IsColorEscape reports no escape.
 void ReadColorEscape(const char* str, col3_t out)
 {
 	int len = IsColorEscape(str);
 	switch (len) {
+	// Indexed escape: the digit after '^' selects an entry of colorEscape.
 	case 2:
 		VectorCopy(colorEscape[str[1] - '0'], out);
 		break;
+	// Direct escape: six hex digits read as three two-digit channels, each
+	// divided by 255 to give a value in 0..1.
+	case 8:
+	{
+		int xr, xg, xb;
+		sscanf(str + 2, "%2x%2x%2x", &xr, &xg, &xb);
+		out[0] = xr / 255.0f;
+		out[1] = xg / 255.0f;
+		out[2] = xb / 255.0f;
+	}
+	break;
+	}
+}
+
+// Decodes the colour escape at the start of str into out and returns the
+// remainder of str after the escape. When there is no escape, out is left
+// untouched and str is returned unchanged (substr(0)).
+std::u32string_view ReadColorEscape(std::u32string_view str, col3_t out)
+{
+	int len = IsColorEscape(str);
+	switch (len) {
+	// Indexed escape: the digit after '^' selects an entry of colorEscape.
+	case 2:
+		VectorCopy(colorEscape[str[1] - U'0'], out);
+		break;
+	// Direct escape: six hex digits read as three two-digit channels.
 	case 8:
 		{
 			int xr, xg, xb;
-			sscanf(str + 2, "%2x%2x%2x", &xr, &xg, &xb);
+			// Copy the six digits into a zero-initialised char buffer so that
+			// sscanf can parse them; the seventh element stays NUL.
+			char buf[7]{};
+			for (size_t i = 0; i < 6; ++i) {
+				buf[i] = (char)str[i + 2];
+			}
+			sscanf(buf, "%2x%2x%2x", &xr, &xg, &xb);
 			out[0] = xr / 255.0f;
 			out[1] = xg / 255.0f;
 			out[2] = xb / 255.0f;
 		}
 		break;
 	}
+	return str.substr(len);
 }
 
 // ================
@@ -279,3 +334,164 @@ dword StringHash(const char* str, int mask)
 	}
 	return hash & mask;
 }
+
+// Hashes str by summing, for every character, the character scaled by 4999
+// XORed with a term derived from its position, then masks the sum with mask.
+dword StringHash(std::string_view str, int mask)
+{
+	dword hash = 0;
+	size_t pos = 0;
+	for (char const ch : str) {
+		// Position term is computed in dword arithmetic, as is the XOR result.
+		dword const positionTerm = ((dword)pos + 17) * 2003;
+		hash += (ch * 4999) ^ positionTerm;
+		++pos;
+	}
+	return hash & mask;
+}
+
+#ifdef _WIN32
+#include <Windows.h>
+
+// Converts the NUL-terminated string str from the given codepage to a newly
+// allocated, NUL-terminated wide string.
+// Returns nullptr when str is null or the conversion fails. Otherwise the
+// result is allocated with new[] and owned by the caller.
+static wchar_t* WidenCodepageString(const char* str, UINT codepage)
+{
+	if (!str) {
+		return nullptr;
+	}
+	// Early-out if empty, avoids ambiguous error return from MBTWC (a result
+	// of 0 is also how it reports failure).
+	if (!*str) {
+		wchar_t* wstr = new wchar_t[1];
+		*wstr = L'\0';
+		return wstr;
+	}
+	DWORD cb = (DWORD)strlen(str);
+	// First pass: validate the input and measure the output.
+	int cch = MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, str, cb, nullptr, 0);
+	if (cch <= 0) {
+		// Invalid string or other error.
+		return nullptr;
+	}
+	wchar_t* wstr = new wchar_t[cch + 1]; // sized MBTWC doesn't include terminator.
+	// Second pass: convert with the same flags as the sizing pass and make
+	// sure the whole buffer was written before handing it out.
+	if (MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, str, cb, wstr, cch) != cch) {
+		delete[] wstr;
+		return nullptr;
+	}
+	wstr[cch] = L'\0';
+	return wstr;
+}
+
+// Widens str using codepage CP_ACP; see WidenCodepageString for the result.
+wchar_t* WidenANSIString(const char* str)
+{
+	return WidenCodepageString(str, CP_ACP);
+}
+
+// Widens str using codepage CP_OEMCP; see WidenCodepageString for the result.
+wchar_t* WidenOEMString(const char* str)
+{
+	return WidenCodepageString(str, CP_OEMCP);
+}
+
+// Widens str using codepage CP_UTF8; see WidenCodepageString for the result.
+wchar_t* WidenUTF8String(const char* str)
+{
+	return WidenCodepageString(str, CP_UTF8);
+}
+
+// Converts the NUL-terminated wide string str to a newly allocated,
+// NUL-terminated string in the given codepage.
+// Returns nullptr when str is null or the conversion fails. Otherwise the
+// result is allocated with new[] and owned by the caller.
+// NOTE(review): unlike WidenCodepageString this helper is not declared
+// static -- confirm whether external linkage is intended.
+char* NarrowCodepageString(const wchar_t* str, UINT codepage)
+{
+	if (!str) {
+		return nullptr;
+	}
+	// Early-out if empty: a result of 0 from WCTMB would otherwise be
+	// indistinguishable from failure.
+	if (!*str) {
+		char* nstr = new char[1];
+		*nstr = '\0';
+		return nstr;
+	}
+	DWORD cch = (DWORD)wcslen(str);
+	// First pass: measure the output.
+	int cb = WideCharToMultiByte(codepage, 0, str, cch, nullptr, 0, nullptr, nullptr);
+	if (cb <= 0) {
+		// Invalid string or other error.
+		return nullptr;
+	}
+	char* nstr = new char[cb + 1]; // sized WCTMB doesn't include terminator.
+	// Second pass: convert, and make sure the whole buffer was written
+	// before handing it out.
+	if (WideCharToMultiByte(codepage, 0, str, cch, nstr, cb, nullptr, nullptr) != cb) {
+		delete[] nstr;
+		return nullptr;
+	}
+	nstr[cb] = '\0';
+	return nstr;
+}
+
+// Releases a wide string with delete[]; passing nullptr is harmless.
+void FreeWideString(wchar_t* str)
+{
+	// delete[] on a null pointer is a no-op, so no explicit check is needed.
+	delete[] str;
+}
+
+// Narrows str using codepage CP_ACP; see NarrowCodepageString for the result.
+char* NarrowANSIString(const wchar_t* str)
+{
+	return NarrowCodepageString(str, CP_ACP);
+}
+
+// Narrows str using codepage CP_OEMCP; see NarrowCodepageString for the result.
+char* NarrowOEMString(const wchar_t* str)
+{
+	return NarrowCodepageString(str, CP_OEMCP);
+}
+
+// Narrows str using codepage CP_UTF8; see NarrowCodepageString for the result.
+char* NarrowUTF8String(const wchar_t* str)
+{
+	return NarrowCodepageString(str, CP_UTF8);
+}
+
+#endif // _WIN32
+
+// Decodes the UTF-8 bytes of input into UTF-32, recording for every produced
+// codepoint the byte offset in input at which its encoding starts.
+//
+// text and sourceCodeUnitOffsets of the result always have the same length.
+// A byte that does not start a sequence with the expected lead/continuation
+// bit patterns yields U+FFFD and is consumed on its own. Overlong forms,
+// surrogates and values above U+10FFFF are not rejected.
+//
+// Kept outside the _WIN32 guard: it uses no platform API and its declaration
+// in common.h is unconditional.
+IndexedUTF32String IndexUTF8ToUTF32(std::string_view input)
+{
+	IndexedUTF32String ret{};
+	auto& text = ret.text;
+	auto& offsets = ret.sourceCodeUnitOffsets;
+
+	size_t const byteCount = input.size();
+	// Every codepoint consumes at least one byte, so byteCount bounds both outputs.
+	text.reserve(byteCount);
+	offsets.reserve(byteCount);
+
+	auto const* bytes = reinterpret_cast<unsigned char const*>(input.data());
+	// True for a continuation byte of the form 10xx'xxxx.
+	auto const isContinuation = [](unsigned char c) { return c >> 6 == 0b10; };
+	// Low six payload bits of a continuation byte.
+	auto const payload = [](unsigned char c) -> char32_t { return c & 0b11'1111; };
+
+	for (size_t byteIdx = 0; byteIdx < byteCount;) {
+		unsigned char const* b = bytes + byteIdx;
+		size_t const left = byteCount - byteIdx;
+
+		// Default outcome: replacement character, consuming a single byte.
+		char32_t codepoint = 0xFFFDu;
+		size_t consumed = 1;
+
+		if (b[0] >> 7 == 0b0) { // 0xxx'xxxx
+			codepoint = b[0];
+		}
+		else if (left >= 2 &&
+			b[0] >> 5 == 0b110 &&
+			isContinuation(b[1]))
+		{ // 110x'xxxx 10xx'xxxx
+			char32_t const p0 = b[0] & 0b1'1111;
+			codepoint = p0 << 6 | payload(b[1]);
+			consumed = 2;
+		}
+		else if (left >= 3 &&
+			b[0] >> 4 == 0b1110 &&
+			isContinuation(b[1]) &&
+			isContinuation(b[2]))
+		{ // 1110'xxxx 10xx'xxxx 10xx'xxxx
+			char32_t const p0 = b[0] & 0b1111;
+			codepoint = p0 << 12 | payload(b[1]) << 6 | payload(b[2]);
+			consumed = 3;
+		}
+		else if (left >= 4 &&
+			b[0] >> 3 == 0b11110 &&
+			isContinuation(b[1]) &&
+			isContinuation(b[2]) &&
+			isContinuation(b[3]))
+		{ // 1111'0xxx 10xx'xxxx 10xx'xxxx 10xx'xxxx
+			char32_t const p0 = b[0] & 0b111;
+			// The lowest six bits come from the fourth byte, b[3].
+			codepoint = p0 << 18 | payload(b[1]) << 12 | payload(b[2]) << 6 | payload(b[3]);
+			consumed = 4;
+		}
+
+		// Exactly one offset and one codepoint per decode step keeps the two
+		// sequences index-aligned, including for malformed bytes.
+		offsets.push_back(byteIdx);
+		text.push_back(codepoint);
+		byteIdx += consumed;
+	}
+
+	return ret;
+}
@@ -255,10 +255,14 @@ bool fileInputStream_c::Read(void* out, size_t len)
 	return fread(out, len, 1, file) < 1;
 }
 
-bool fileInputStream_c::FileOpen(const char* fileName, bool binary)
+bool fileInputStream_c::FileOpen(std::filesystem::path const& fileName, bool binary)
 {
 	FileClose();
-	file = fopen(fileName, binary? "rb" : "r");
+#ifdef _WIN32
+	file = _wfopen(fileName.c_str(), binary ? L"rb" : L"r");
+#else
+	file = fopen(fileName.c_str(), binary ? "rb" : "r");
+#endif
 	if ( !file ) {
 		return true;
 	}
@@ -277,10 +281,14 @@ bool fileOutputStream_c::Write(const void* in, size_t len)
 	return fwrite(in, len, 1, file) < 1;
 }
 
-bool fileOutputStream_c::FileOpen(const char* fileName, bool binary)
+bool fileOutputStream_c::FileOpen(std::filesystem::path const& fileName, bool binary)
 {
 	FileClose();
-	file = fopen(fileName, binary? "wb" : "w");
+#ifdef _WIN32
+	file = _wfopen(fileName.c_str(), binary ? L"wb" : L"w");
+#else
+	file = fopen(fileName.c_str(), binary ? "wb" : "w");
+#endif
 	if ( !file ) {
 		return true;
 	}
Original file line number	Diff line number	Diff line change
`@@ -255,10 +255,14 @@ bool fileInputStream_c::Read(void* out, size_t len)`
`255`	`255`	`return fread(out, len, 1, file) < 1;`
`256`	`256`	`}`
`257`	`257`
`258`		`-bool fileInputStream_c::FileOpen(const char* fileName, bool binary)`
	`258`	`+bool fileInputStream_c::FileOpen(std::filesystem::path const& fileName, bool binary)`
`259`	`259`	`{`
`260`	`260`	`FileClose();`
`261`		`- file = fopen(fileName, binary? "rb" : "r");`
	`261`	`+#ifdef _WIN32`
	`262`	`+ file = _wfopen(fileName.c_str(), binary ? L"rb" : L"r");`
	`263`	`+#else`
	`264`	`+ file = fopen(fileName.c_str(), binary ? "rb" : "r");`
	`265`	`+#endif`
`262`	`266`	`if ( !file ) {`
`263`	`267`	`return true;`
`264`	`268`	`}`
`@@ -277,10 +281,14 @@ bool fileOutputStream_c::Write(const void* in, size_t len)`
`277`	`281`	`return fwrite(in, len, 1, file) < 1;`
`278`	`282`	`}`
`279`	`283`
`280`		`-bool fileOutputStream_c::FileOpen(const char* fileName, bool binary)`
	`284`	`+bool fileOutputStream_c::FileOpen(std::filesystem::path const& fileName, bool binary)`
`281`	`285`	`{`
`282`	`286`	`FileClose();`
`283`		`- file = fopen(fileName, binary? "wb" : "w");`
	`287`	`+#ifdef _WIN32`
	`288`	`+ file = _wfopen(fileName.c_str(), binary ? L"wb" : L"w");`
	`289`	`+#else`
	`290`	`+ file = fopen(fileName.c_str(), binary ? "wb" : "w");`
	`291`	`+#endif`
`284`	`292`	`if ( !file ) {`
`285`	`293`	`return true;`
`286`	`294`	`}`