This commit is contained in:
wingdeans 2025-06-10 11:12:12 +02:00 committed by GitHub
commit 474fe50f3d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 21 additions and 3 deletions

View file

@ -26,8 +26,8 @@ static const char kUtf8Dispatch[] = {
1, 1, 1, 1, 1, 1, 1, 1, // 0320
1, 1, 1, 1, 1, 1, 1, 1, // 0330
2, 3, 3, 3, 3, 3, 3, 3, // 0340 utf8-3
3, 3, 3, 3, 3, 3, 3, 3, // 0350
4, 5, 5, 5, 5, 0, 0, 0, // 0360 utf8-4
3, 3, 3, 3, 3, 4, 3, 3, // 0350
5, 6, 6, 6, 7, 0, 0, 0, // 0360 utf8-4
0, 0, 0, 0, 0, 0, 0, 0, // 0370
};
@ -95,6 +95,7 @@ bool32 isutf8(const void *data, size_t size) {
}
// fallthrough
case 3:
case_utf8_3:
if (p + 2 <= e && //
(p[0] & 0300) == 0200 && //
(p[1] & 0300) == 0200) { //
@ -104,11 +105,17 @@ bool32 isutf8(const void *data, size_t size) {
return false; // missing cont
}
case 4:
if (p < e && (*p & 040)) {
return false; // utf-16 surrogate
}
goto case_utf8_3;
case 5:
if (p < e && (*p & 0377) < 0220) {
return false; // overlong
}
// fallthrough
case 5:
case 6:
case_utf8_4:
if (p + 3 <= e && //
(((uint32_t)(p[+2] & 0377) << 030 | //
(uint32_t)(p[+1] & 0377) << 020 | //
@ -120,6 +127,11 @@ bool32 isutf8(const void *data, size_t size) {
} else {
return false; // missing cont
}
case 7:
if (p < e && (*p & 0x3F) > 0xF) {
return false; // over limit
}
goto case_utf8_4;
default:
__builtin_unreachable();
}

View file

@ -39,6 +39,9 @@ TEST(isutf8, good) {
"剑号巨阙 珠称夜光 果珍李柰 菜重芥姜 海咸河淡 鳞潜羽翔"
"龙师火帝 鸟官人皇 始制文字 乃服衣裳 推位让国 有虞陶唐",
-1));
EXPECT_TRUE(isutf8("\xf4\x8f\xbf\xbf", -1));
EXPECT_TRUE(isutf8("\xed\x9f\xbf", -1));
EXPECT_TRUE(isutf8("\xee\x80\x80", -1));
}
TEST(isutf8, bad) {
@ -46,6 +49,9 @@ TEST(isutf8, bad) {
ASSERT_FALSE(isutf8("\200\300", -1)); // latin1 c1 control code
ASSERT_FALSE(isutf8("\300\300", -1)); // missing continuation
ASSERT_FALSE(isutf8("\377\200\200\200\200", -1)); // thompson-pike varint
ASSERT_FALSE(isutf8("\xf4\x90\x80\x80", -1)); // over limit
ASSERT_FALSE(isutf8("\xed\xa0\x80", -1));
ASSERT_FALSE(isutf8("\xed\xbf\xbf", -1)); // surrogate pairs
}
TEST(isutf8, oob) {