From 1a1f521082a40a42495ce92edef2e89af1a49c50 Mon Sep 17 00:00:00 2001 From: AZero13 Date: Sat, 6 Dec 2025 10:39:12 -0500 Subject: [PATCH 1/4] Reject overlong forms and out-of-range code points in _Mbrtowc Added UTF-8 validation in _Mbrtowc to reject overlong encodings and out-of-range code points, returning EILSEQ instead of accepting ill-formed sequences. --- stl/src/xmbtowc.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/stl/src/xmbtowc.cpp b/stl/src/xmbtowc.cpp index 2cc7f8766c5..e34df4afa12 100644 --- a/stl/src/xmbtowc.cpp +++ b/stl/src/xmbtowc.cpp @@ -126,6 +126,12 @@ _MRTIMP2 _Success_(return >= 0) int __cdecl _Mbrtowc( } } + // Reject overlong forms and out-of-range code points (see N4950 [locale.codecvt.virtuals]/3) + if ((consumedCount == 2 && wch < 0x80u) || (consumedCount == 3 && wch < 0x800u) || wch > 0x10FFFFu) { + errno = EILSEQ; + return -1; + } + if (wch >= 0xD800u && wch <= 0xDFFFu) { // tried to decode unpaired surrogate errno = EILSEQ; return -1; From 8105e46aa7f91ce2ba12bdd57a2f6353c8b338a9 Mon Sep 17 00:00:00 2001 From: cpplearner Date: Sat, 31 Jan 2026 20:17:33 +0800 Subject: [PATCH 2/4] Add test case --- tests/std/tests/VSO_0644691_utf_8_codecvt/test.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/std/tests/VSO_0644691_utf_8_codecvt/test.cpp b/tests/std/tests/VSO_0644691_utf_8_codecvt/test.cpp index 1e80799d4d8..8d7b57e4b65 100644 --- a/tests/std/tests/VSO_0644691_utf_8_codecvt/test.cpp +++ b/tests/std/tests/VSO_0644691_utf_8_codecvt/test.cpp @@ -48,6 +48,9 @@ const encoding_test_char utf_test_cases[] = { // ^^^ TRANSITION, VSO-653059 ^^^ {"\xC2\x61", nullptr}, // Wrong number of trailing bytes {"\xE0\xA0\x61", nullptr}, + + {"\xC0\x80", nullptr}, // overlong form + {"\xC0\x81", nullptr}, }; void assert_empty_file(const wchar_t* const fileName) { From 8cb140a51f3e425cddcb2d8c894d9428a22342d4 Mon Sep 17 00:00:00 2001 From: cpplearner Date: Sat, 31 Jan 2026 20:20:13 +0800 Subject: [PATCH 3/4] Don't need to reject out-of-range values --- stl/src/xmbtowc.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stl/src/xmbtowc.cpp b/stl/src/xmbtowc.cpp index e34df4afa12..236f9f62a5a 100644 --- a/stl/src/xmbtowc.cpp +++ b/stl/src/xmbtowc.cpp @@ -126,8 +126,7 @@ _MRTIMP2 _Success_(return >= 0) int __cdecl _Mbrtowc( } } - // Reject overlong forms and out-of-range code points (see N4950 [locale.codecvt.virtuals]/3) - if ((consumedCount == 2 && wch < 0x80u) || (consumedCount == 3 && wch < 0x800u) || wch > 0x10FFFFu) { + if ((consumedCount == 2 && wch < 0x80u) || (consumedCount == 3 && wch < 0x800u)) { // overlong forms errno = EILSEQ; return -1; } From f6883d6c95c35e0bde6d41b09c5d8218636a3a10 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Sat, 31 Jan 2026 05:46:46 -0800 Subject: [PATCH 4/4] Add test cases. --- tests/std/tests/VSO_0644691_utf_8_codecvt/test.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/std/tests/VSO_0644691_utf_8_codecvt/test.cpp b/tests/std/tests/VSO_0644691_utf_8_codecvt/test.cpp index 8d7b57e4b65..9054e330f0a 100644 --- a/tests/std/tests/VSO_0644691_utf_8_codecvt/test.cpp +++ b/tests/std/tests/VSO_0644691_utf_8_codecvt/test.cpp @@ -32,7 +32,7 @@ const encoding_test_char utf_test_cases[] = { {"\xED\x9F\xBF", L"\uD7FF"}, // U+D7FF unencoded from the Hangul Jamo Extended-B block, // last character before UTF-16 illegal region {"\xED\xA0\x80", nullptr}, // U+D800 beginning of surrogate range - {nullptr, L"\xD800"}, + {nullptr, L"\xD800"}, // {"\xED\xBF\xBF", nullptr}, // U+DFFF end of surrogate range {nullptr, L"\xDFFF"}, {"\xEE\x80\x80", L"\uE000"}, // U+E000 unencoded from the private use area, first codepoint after surrogate range @@ -49,8 +49,16 @@ const encoding_test_char utf_test_cases[] = { {"\xC2\x61", nullptr}, // Wrong number of trailing bytes {"\xE0\xA0\x61", nullptr}, - {"\xC0\x80", nullptr}, // overlong form - {"\xC0\x81", nullptr}, + {"\xC0\x80", nullptr}, // overlong form: 2 bytes for U+0000 NULL + {"\xC0\x81", nullptr}, // overlong form: 2 bytes for U+0001 START OF HEADING + + {"\xC1\xBF", nullptr}, // overlong form: 2 bytes for U+007F DELETE + {"\xE0\x81\xBF", nullptr}, // overlong form: 3 bytes for U+007F DELETE + {"\xF0\x80\x81\xBF", nullptr}, // overlong form: 4 bytes for U+007F DELETE + + {"\xD0\xAF", L"\u042F"}, // U+042F CYRILLIC CAPITAL LETTER YA + {"\xE0\x90\xAF", nullptr}, // overlong form: 3 bytes for U+042F CYRILLIC CAPITAL LETTER YA + {"\xF0\x80\x90\xAF", nullptr}, // overlong form: 4 bytes for U+042F CYRILLIC CAPITAL LETTER YA }; void assert_empty_file(const wchar_t* const fileName) {