Skip to content

Commit 47d7686

Browse files
authored Jan 28, 2025
Merge pull request #67 from zao/feat/wide-path-io-v2
feat: change all narrow filesystem paths and operations to speak UTF-8
2 parents 1443976 + 6be0f31 commit 47d7686

37 files changed

+1387
-457
lines changed
 

‎.gitmodules

+3
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,6 @@
1010
[submodule "dep/glm"]
1111
path = dep/glm
1212
url = https://github.com/g-truc/glm.git
13+
[submodule "libs/luautf8"]
14+
path = libs/luautf8
15+
url = https://github.com/starwing/luautf8.git

‎CMakeLists.txt

+20
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,26 @@ target_link_libraries(lcurl
271271
install(TARGETS lcurl RUNTIME DESTINATION ".")
272272
install(FILES $<TARGET_RUNTIME_DLLS:lcurl> DESTINATION ".")
273273

274+
# luautf8 module: builds libs/luautf8 as a shared library linked against LuaJIT.

add_library(lua-utf8 SHARED libs/luautf8/lutf8lib.c)

target_compile_definitions(lua-utf8 PRIVATE LUA_BUILD_AS_DLL)

# No additional private include directories are listed for this target.
target_include_directories(lua-utf8 PRIVATE)

target_link_libraries(lua-utf8 PRIVATE LuaJIT::LuaJIT)

# Install the module and the runtime DLLs it depends on next to the executable.
install(TARGETS lua-utf8 RUNTIME DESTINATION ".")
install(FILES $<TARGET_RUNTIME_DLLS:lua-utf8> DESTINATION ".")
274294

275295
# lzip module
276296

‎engine/common.h

+27-2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
#include "common/memtrak3.h"
2727
#endif
2828

29+
#include <string>
30+
#include <string_view>
31+
#include <vector>
32+
2933
// =======
3034
// Classes
3135
// =======
@@ -475,15 +479,36 @@ T clamp(T &v, T l, T u)
475479
// Common Functions
476480
// ================
477481

478-
int IsColorEscape(const char* str);
479-
void ReadColorEscape(const char* str, col3_t out);
482+
int IsColorEscape(char const* str);
483+
int IsColorEscape(std::u32string_view str);
484+
void ReadColorEscape(char const* str, col3_t out);
485+
std::u32string_view ReadColorEscape(std::u32string_view str, col3_t out);
480486

481487
char* _AllocString(const char* str, const char* file, int line);
482488
#define AllocString(s) _AllocString(s, __FILE__, __LINE__)
483489
char* _AllocStringLen(size_t len, const char* file, int line);
484490
#define AllocStringLen(s) _AllocStringLen(s, __FILE__, __LINE__)
485491
void FreeString(const char* str);
486492
dword StringHash(const char* str, int mask);
493+
dword StringHash(std::string_view str, int mask);
494+
495+
// UTF-32 text together with an index back into the string it was decoded from
// (produced by IndexUTF8ToUTF32).
struct IndexedUTF32String
{
	// The decoded UTF-32 code points.
	std::u32string text;

	// Byte offsets into the source string at which the decoded sequences start.
	std::vector<size_t> sourceCodeUnitOffsets;
};
499+
500+
IndexedUTF32String IndexUTF8ToUTF32(std::string_view str);
501+
502+
#ifdef _WIN32
503+
wchar_t* WidenANSIString(const char* str);
504+
wchar_t* WidenOEMString(const char* str);
505+
wchar_t* WidenUTF8String(const char* str);
506+
void FreeWideString(wchar_t* str);
507+
508+
char* NarrowANSIString(const wchar_t* str);
509+
char* NarrowOEMString(const wchar_t* str);
510+
char* NarrowUTF8String(const wchar_t* str);
511+
#endif
487512

488513
#ifndef _WIN32
489514
#define _stricmp strcasecmp

‎engine/common/common.cpp

+219-3
Original file line numberDiff line numberDiff line change
@@ -205,9 +205,10 @@ int IsColorEscape(const char* str)
205205
}
206206
if (isdigit(str[1])) {
207207
return 2;
208-
} else if (str[1] == 'x' || str[1] == 'X') {
208+
}
209+
else if (str[1] == 'x' || str[1] == 'X') {
209210
for (int c = 0; c < 6; c++) {
210-
if ( !isxdigit(str[c + 2]) ) {
211+
if (!isxdigit(str[c + 2])) {
211212
return 0;
212213
}
213214
}
@@ -216,23 +217,77 @@ int IsColorEscape(const char* str)
216217
return 0;
217218
}
218219

220+
// Checks whether `str` begins with a colour escape sequence.
// Returns the length of the escape in code points: 2 for an indexed escape
// such as ^7, 8 for a direct escape such as ^x123ABC, or 0 if there is none.
int IsColorEscape(std::u32string_view str)
{
	// Only ASCII digits and letters are accepted, so the locale-aware
	// isdigit/isxdigit classifiers are deliberately not used here.
	auto const isDecimal = [](char32_t c) {
		return c >= U'0' && c <= U'9';
	};
	auto const isHex = [&](char32_t c) {
		return isDecimal(c) || (c >= U'A' && c <= U'F') || (c >= U'a' && c <= U'f');
	};

	if (str.size() < 2 || str[0] != U'^') {
		return 0;
	}

	char32_t const kind = str[1];

	// Indexed colour escape: caret followed by a single digit.
	if (isDecimal(kind)) {
		return 2;
	}

	// Direct colour escape: caret, 'x' or 'X', then exactly six hex digits.
	bool const isDirect = (kind == U'x' || kind == U'X');
	if (!isDirect || str.size() < 8) {
		return 0;
	}
	for (char32_t const digit : str.substr(2, 6)) {
		if (!isHex(digit)) {
			return 0;
		}
	}
	return 8;
}
249+
219250
void ReadColorEscape(const char* str, col3_t out)
220251
{
221252
int len = IsColorEscape(str);
222253
switch (len) {
223254
case 2:
224255
VectorCopy(colorEscape[str[1] - '0'], out);
225256
break;
257+
case 8:
258+
{
259+
int xr, xg, xb;
260+
sscanf(str + 2, "%2x%2x%2x", &xr, &xg, &xb);
261+
out[0] = xr / 255.0f;
262+
out[1] = xg / 255.0f;
263+
out[2] = xb / 255.0f;
264+
}
265+
break;
266+
}
267+
}
268+
269+
std::u32string_view ReadColorEscape(std::u32string_view str, col3_t out)
270+
{
271+
int len = IsColorEscape(str);
272+
switch (len) {
273+
case 2:
274+
VectorCopy(colorEscape[str[1] - U'0'], out);
275+
break;
226276
case 8:
227277
{
228278
int xr, xg, xb;
229-
sscanf(str + 2, "%2x%2x%2x", &xr, &xg, &xb);
279+
char buf[7]{};
280+
for (size_t i = 0; i < 6; ++i) {
281+
buf[i] = (char)str[i + 2];
282+
}
283+
sscanf(buf, "%2x%2x%2x", &xr, &xg, &xb);
230284
out[0] = xr / 255.0f;
231285
out[1] = xg / 255.0f;
232286
out[2] = xb / 255.0f;
233287
}
234288
break;
235289
}
290+
return str.substr(len);
236291
}
237292

238293
// ================
@@ -279,3 +334,164 @@ dword StringHash(const char* str, int mask)
279334
}
280335
return hash & mask;
281336
}
337+
338+
// Hashes the characters of `str`, mixing each character with its position,
// and masks the result with `mask`.
dword StringHash(std::string_view str, int mask)
{
	dword accum = 0;
	dword position = 0;
	for (char const ch : str) {
		accum += (ch * 4999) ^ ((position + 17) * 2003);
		++position;
	}
	return accum & mask;
}
347+
348+
#ifdef _WIN32
349+
#include <Windows.h>
350+
351+
static wchar_t* WidenCodepageString(const char* str, UINT codepage)
352+
{
353+
if (!str) {
354+
return nullptr;
355+
}
356+
// Early-out if empty, avoids ambigious error return from MBTWC.
357+
if (!*str) {
358+
wchar_t* wstr = new wchar_t[1];
359+
*wstr = L'\0';
360+
return wstr;
361+
}
362+
DWORD cb = (DWORD)strlen(str);
363+
int cch = MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, str, cb, nullptr, 0);
364+
if (cch == 0) {
365+
// Invalid string or other error.
366+
return nullptr;
367+
}
368+
wchar_t* wstr = new wchar_t[cch + 1]; // sized MBTWC doesn't include terminator.
369+
MultiByteToWideChar(codepage, 0, str, cb, wstr, cch);
370+
wstr[cch] = '\0';
371+
return wstr;
372+
}
373+
374+
wchar_t* WidenANSIString(const char* str)
375+
{
376+
return WidenCodepageString(str, CP_ACP);
377+
}
378+
379+
wchar_t* WidenOEMString(const char* str)
380+
{
381+
return WidenCodepageString(str, CP_OEMCP);
382+
}
383+
384+
wchar_t* WidenUTF8String(const char* str)
385+
{
386+
return WidenCodepageString(str, CP_UTF8);
387+
}
388+
389+
char* NarrowCodepageString(const wchar_t* str, UINT codepage)
390+
{
391+
if (!str) {
392+
return nullptr;
393+
}
394+
if (!*str) {
395+
char* nstr = new char[1];
396+
*nstr = '\0';
397+
return nstr;
398+
}
399+
DWORD cch = (DWORD)wcslen(str);
400+
int cb = WideCharToMultiByte(codepage, 0, str, cch, nullptr, 0, nullptr, nullptr);
401+
if (cb == 0) {
402+
// Invalid string or other error.
403+
return nullptr;
404+
}
405+
char* nstr = new char[cb + 1];
406+
WideCharToMultiByte(codepage, 0, str, cch, nstr, cb, nullptr, nullptr);
407+
nstr[cb] = '\0';
408+
return nstr;
409+
}
410+
411+
// Releases a wide string that was allocated with new[].
// Passing nullptr is allowed and does nothing.
void FreeWideString(wchar_t* str)
{
	// delete[] on a null pointer is a no-op, so no explicit check is needed.
	delete[] str;
}
417+
418+
char* NarrowANSIString(const wchar_t* str)
419+
{
420+
return NarrowCodepageString(str, CP_ACP);
421+
}
422+
423+
char* NarrowOEMString(const wchar_t* str)
424+
{
425+
return NarrowCodepageString(str, CP_OEMCP);
426+
}
427+
428+
char* NarrowUTF8String(const wchar_t* str)
429+
{
430+
return NarrowCodepageString(str, CP_UTF8);
431+
}
432+
433+
// Decodes the UTF-8 string `input` into UTF-32 and records, for every code
// point produced, the byte offset in `input` at which its encoding starts.
//
// A byte that does not begin a well-formed sequence is decoded as a single
// U+FFFD replacement character and decoding resumes at the next byte, so
// `text` and `sourceCodeUnitOffsets` always have the same length.
// Only the lead/continuation bit patterns are checked: overlong encodings,
// surrogates and values above U+10FFFF are not rejected.
//
// NOTE(review): this definition sits inside the `#ifdef _WIN32` block although
// it uses no Windows API - confirm whether non-Windows builds need it.
IndexedUTF32String IndexUTF8ToUTF32(std::string_view input)
{
	IndexedUTF32String ret{};

	size_t const byteCount = input.size();
	auto& offsets = ret.sourceCodeUnitOffsets;
	auto& text = ret.text;
	// Conservative reservations: there is at most one code point per input byte.
	offsets.reserve(byteCount);
	text.reserve(byteCount);

	auto const bytes = reinterpret_cast<uint8_t const*>(input.data());
	auto const isContinuation = [](uint8_t byte) { return byte >> 6 == 0b10; }; // 10xx'xxxx
	auto const payload = [](uint8_t byte) -> char32_t { return byte & 0b11'1111u; };

	for (size_t byteIdx = 0; byteIdx < byteCount;) {
		uint8_t const* b = bytes + byteIdx;
		size_t const left = byteCount - byteIdx;

		// Default outcome: an invalid byte becomes U+FFFD and consumes one byte.
		char32_t codepoint = 0xFFFDu;
		size_t consumed = 1;

		if (b[0] >> 7 == 0b0) { // 0xxx'xxxx
			codepoint = b[0];
		}
		else if (left >= 2 &&
			b[0] >> 5 == 0b110 && // 110x'xxxx
			isContinuation(b[1]))
		{
			char32_t const lead = b[0] & 0b1'1111u;
			codepoint = lead << 6 | payload(b[1]);
			consumed = 2;
		}
		else if (left >= 3 &&
			b[0] >> 4 == 0b1110 && // 1110'xxxx
			isContinuation(b[1]) &&
			isContinuation(b[2]))
		{
			char32_t const lead = b[0] & 0b1111u;
			codepoint = lead << 12 | payload(b[1]) << 6 | payload(b[2]);
			consumed = 3;
		}
		else if (left >= 4 &&
			b[0] >> 3 == 0b11110 && // 1111'0xxx
			isContinuation(b[1]) &&
			isContinuation(b[2]) &&
			isContinuation(b[3]))
		{
			char32_t const lead = b[0] & 0b111u;
			// The low six bits come from the fourth byte, b[3].
			codepoint = lead << 18 | payload(b[1]) << 12 | payload(b[2]) << 6 | payload(b[3]);
			consumed = 4;
		}

		// Exactly one offset and one code point per iteration keeps the two
		// sequences index-aligned.
		offsets.push_back(byteIdx);
		text.push_back(codepoint);
		byteIdx += consumed;
	}

	return ret;
}
496+
497+
#endif

‎engine/common/streams.cpp

+12-4
Original file line numberDiff line numberDiff line change
@@ -255,10 +255,14 @@ bool fileInputStream_c::Read(void* out, size_t len)
255255
return fread(out, len, 1, file) < 1;
256256
}
257257

258-
bool fileInputStream_c::FileOpen(const char* fileName, bool binary)
258+
bool fileInputStream_c::FileOpen(std::filesystem::path const& fileName, bool binary)
259259
{
260260
FileClose();
261-
file = fopen(fileName, binary? "rb" : "r");
261+
#ifdef _WIN32
262+
file = _wfopen(fileName.c_str(), binary ? L"rb" : L"r");
263+
#else
264+
file = fopen(fileName.c_str(), binary ? "rb" : "r");
265+
#endif
262266
if ( !file ) {
263267
return true;
264268
}
@@ -277,10 +281,14 @@ bool fileOutputStream_c::Write(const void* in, size_t len)
277281
return fwrite(in, len, 1, file) < 1;
278282
}
279283

280-
bool fileOutputStream_c::FileOpen(const char* fileName, bool binary)
284+
bool fileOutputStream_c::FileOpen(std::filesystem::path const& fileName, bool binary)
281285
{
282286
FileClose();
283-
file = fopen(fileName, binary? "wb" : "w");
287+
#ifdef _WIN32
288+
file = _wfopen(fileName.c_str(), binary ? L"wb" : L"w");
289+
#else
290+
file = fopen(fileName.c_str(), binary ? "wb" : "w");
291+
#endif
284292
if ( !file ) {
285293
return true;
286294
}

0 commit comments

Comments
 (0)
Please sign in to comment.