Fix %[hl]s field length/precision on printf funcs

Cosmopolitan currently has a completely wrong understanding (according
to the C Standard) of how to handle precision and field length for
wide strings, which have to be handled not by using the
monospace display width but instead by using the outputted byte
width (i.e. the UTF-8 length of the wide string).

This patch corrects this, corrects comments documenting the previous
behavior, adds tests for the behavior and refactors tests that tested
for the previous behavior to correctly use the new correct behavior.
This commit is contained in:
Gabriel Ravier 2024-09-01 00:27:03 +02:00
parent 5b9862907c
commit a779fc7885
4 changed files with 203 additions and 34 deletions

View file

@ -365,19 +365,24 @@ static int __fmt_stoa_byte(out_f out, void *a, uint64_t c) {
return out(buf, a, 1);
}
static size_t __fmt_stoa_get_utf8_size(uint64_t w)
{
return w ? (bsr(w) >> 3) + 1 : 1;
}
static int __fmt_stoa_wide(out_f out, void *a, uint64_t w) {
char buf[8];
if (!isascii(w))
w = tpenc(w);
WRITE64LE(buf, w);
return out(buf, a, w ? (bsr(w) >> 3) + 1 : 1);
return out(buf, a, __fmt_stoa_get_utf8_size(w));
}
static int __fmt_stoa_bing(out_f out, void *a, uint64_t w) {
char buf[8];
w = tpenc(kCp437[w & 0xFF]);
WRITE64LE(buf, w);
return out(buf, a, w ? (bsr(w) >> 3) + 1 : 1);
return out(buf, a, __fmt_stoa_get_utf8_size(w));
}
static int __fmt_stoa_quoted(out_f out, void *a, uint64_t w) {
@ -388,7 +393,7 @@ static int __fmt_stoa_quoted(out_f out, void *a, uint64_t w) {
w = tpenc(w);
}
WRITE64LE(buf, w);
return out(buf, a, w ? (bsr(w) >> 3) + 1 : 1);
return out(buf, a, __fmt_stoa_get_utf8_size(w));
}
/**
@ -410,6 +415,7 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
char *p, buf[1];
unsigned w, pad;
bool justdobytes, ignorenul;
size_t i, next_char_outputted_byte_length, current_outputted_byte_length;
p = data;
if (!p) {
@ -442,13 +448,93 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
emit = __fmt_stoa_byte;
}
/*
The standard specifies precision and field length in terms of bytes, not
characters, despite the confusing mention of "characters" when defining
field length (in the C standard, "character" usually actually means
"byte").
This is made clear in the standard by this example:
> EXAMPLE 2 - In this example, multibyte characters do not have a
> state-dependent encoding, and the members of the extended character set
> that consist of more than one byte each consist of exactly two bytes, the
> first of which is denoted here by a and the second by an uppercase
> letter.
>
> Given the following wide string with length seven,
>
> static wchar_t wstr[] = L"□X□Yabc□Z□W";
>
> the seven calls
>
> fprintf(stdout, "|1234567890123|\n");
> fprintf(stdout, "|%13ls|\n", wstr);
> fprintf(stdout, "|%-13.9ls|\n", wstr);
> fprintf(stdout, "|%13.10ls|\n", wstr);
> fprintf(stdout, "|%13.11ls|\n", wstr);
> fprintf(stdout, "|%13.15ls|\n", &wstr[2]);
> fprintf(stdout, "|%13lc|\n", (wint_t) wstr[5]);
>
> will print the following seven lines:
>
> |1234567890123|
> | XYabcZW|
> |XYabcZ |
> | XYabcZ|
> | XYabcZW|
> | abcZW|
> | Z|
- C Standard, 7.23.5.6.18, The fprintf function
The example explicitly shows that precision and field width are interpreted
as output byte length.
Thus, to determine how many wide characters we must output, we need to
calculate how many bytes we are going to output.
Note that the standard also states that "In no case is a partial multibyte
character written", when specifying the behavior of wide string conversions,
meaning we need to make sure that we stop before any character that would
result in more bytes being outputted than the precision specified is reached
(this is not the case for non-wide string conversions, meaning those may
write partial multibyte characters just fine)
*/
if (!(flags & FLAGS_PRECISION))
precision = -1;
if (!(flags & FLAGS_PRECISION) || !ignorenul) {
current_outputted_byte_length = 0;
if (signbit == 63) {
precision = wcsnlen((const wchar_t *)p, precision);
for (i = 0; i < precision; ++i) {
wc = ((const wchar_t *)p)[i];
if (wc == L'\0')
break;
next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
if (current_outputted_byte_length + next_char_outputted_byte_length >
precision)
break;
current_outputted_byte_length += next_char_outputted_byte_length;
}
precision = current_outputted_byte_length;
} else if (signbit == 15) {
precision = strnlen16((const char16_t *)p, precision);
for (i = 0; i < precision; ++i) {
wc = ((const char16_t *)p)[i];
if (wc == 0)
break;
if (IsUcs2(wc))
next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
else if (IsUtf16Cont(wc)) {
if (i + 1 < precision && ((const char16_t *)p)[i + 1] != 0)
wc = MergeUtf16(wc, ((const char16_t *)p)[i + 1]);
next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
} else {
next_char_outputted_byte_length = 0;
}
if (current_outputted_byte_length + next_char_outputted_byte_length >
precision)
break;
current_outputted_byte_length += next_char_outputted_byte_length;
}
precision = current_outputted_byte_length;
} else {
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overread"
@ -460,13 +546,6 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
pad = 0;
if (width) {
w = precision;
if (signbit == 63) {
w = wcsnwidth((const wchar_t *)p, precision, 0);
} else if (signbit == 15) {
w = strnwidth16((const char16_t *)p, precision, 0);
} else {
w = strnwidth(p, precision, 0);
}
if (!(flags & FLAGS_NOQUOTE) && (flags & FLAGS_REPR)) {
w += 2 + (signbit == 63) + (signbit == 15);
}
@ -502,20 +581,16 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
return -1;
}
} else {
while (precision--) {
while (precision) {
if (signbit == 15) {
wc = *(const char16_t *)p;
if (!wc && !ignorenul)
break;
if (IsUcs2(wc)) {
p += sizeof(char16_t);
} else if (IsUtf16Cont(wc)) {
} else if (IsUtf16Cont(wc) &&
precision > __fmt_stoa_get_utf8_size(tpenc(wc))) {
p += sizeof(char16_t);
continue;
} else if (!precision) {
break;
} else {
--precision;
wc = MergeUtf16(wc, *(const char16_t *)p);
}
} else if (signbit == 63) {
@ -536,7 +611,6 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
wc = ThomPikeByte(wc);
if (n > precision)
break;
precision -= n;
while (n--) {
wc = ThomPikeMerge(wc, *p++);
}
@ -544,6 +618,11 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
}
if (emit(out, arg, wc) == -1)
return -1;
next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
if (next_char_outputted_byte_length > precision)
precision = 0;
else
precision -= next_char_outputted_byte_length;
}
}
@ -792,8 +871,8 @@ static int __fmt_noop(const char *, void *, size_t) {
* - `%.*s` supplied byte length argument (obeys nul terminator)
* - ``%`.*s`` supplied byte length argument c escaped (ignores nul term)
* - `%#.*s` supplied byte length argument visualized (ignores nul term)
* - `%.*hs` supplied char16_t length argument (obeys nul terminator)
* - `%.*ls` supplied wchar_t length argument (obeys nul terminator)
* - `%.*hs` supplied output byte length argument (obeys nul terminator)
* - `%.*ls` supplied output byte length argument (obeys nul terminator)
*
* Formatting Modifiers
*
@ -1078,7 +1157,7 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
p = charbuf;
qchar = '\'';
flags |= FLAGS_PRECISION;
prec = 1;
prec = __fmt_stoa_get_utf8_size(tpenc(charbuf[0]));
goto FormatString;
} else if (flags & (FLAGS_QUOTE | FLAGS_REPR)) {
p = "'\\0'";

View file

@ -33,11 +33,13 @@
* consistent with glibc, musl, and uclibc.
*
* - `%hs` converts UTF-16/UCS-2 UTF-8, which can be helpful on Windows.
* Formatting (e.g. %-10hs) will use monospace display width rather
* than string length or codepoint count.
* Formatting (e.g. %-10hs) will use output byte length (rounded down to
* ensure only full multibyte characters are output) rather than string
* length or codepoint count.
*
* - `%ls` (or `%Ls`) converts UTF-32 UTF-8. Formatting (e.g. %-10ls)
* will use monospace display width rather than string length.
* will use output byte length (rounded down to ensure only full multibyte
* characters are output) rather than string length.
*
* - The `%#s` and `%#c` alternate forms display values using the
* standard IBM standard 256-letter alphabet. Using `%#.*s` to specify

View file

@ -35,3 +35,81 @@ TEST(snprintf, testPlusFlagOnChar) {
ASSERT_EQ(i, 1);
ASSERT_STREQ(buf, "=");
}
TEST(snprintf, testStringWidth) {
char buf[100] = {};
int i = snprintf(buf, sizeof(buf), "<%9ls>", L"éée");
ASSERT_EQ(i, 11);
ASSERT_STREQ(buf, "< éée>");
i = snprintf(buf, sizeof(buf), "<%9s>", "éée");
ASSERT_EQ(i, 11);
ASSERT_STREQ(buf, "< éée>");
i = snprintf(buf, sizeof(buf), "<%9hs>", u"éée");
ASSERT_EQ(i, 11);
ASSERT_STREQ(buf, "< éée>");
}
TEST(snprintf, testStringPrecision) {
char buf[100] = {};
int i = snprintf(buf, sizeof(buf), "%.4ls", L"eeée");
ASSERT_EQ(i, 4);
ASSERT_STREQ(buf, "eeé");
i = snprintf(buf, sizeof(buf), "%.4s", "eeée");
ASSERT_EQ(i, 4);
ASSERT_STREQ(buf, "eeé");
i = snprintf(buf, sizeof(buf), "%.4hs", u"eeée");
ASSERT_EQ(i, 4);
ASSERT_STREQ(buf, "eeé");
}
TEST(snprintf, testStringPrecisionPartialChar) {
char buf[100] = {};
int i = snprintf(buf, sizeof(buf), "%.4ls", L"eéée");
ASSERT_EQ(i, 3);
ASSERT_STREQ(buf, "");
// Note that partial multibyte characters are fine on non-wide conversions
// (whereas they are not written on wide conversions,
// according to the standard)
i = snprintf(buf, sizeof(buf), "%.4s", "eéée");
ASSERT_EQ(i, 4);
ASSERT_STREQ(buf, "\xc3");
i = snprintf(buf, sizeof(buf), "%.4hs", u"eéée");
ASSERT_EQ(i, 3);
ASSERT_STREQ(buf, "");
}
TEST(snprintf, testWideChar) {
char buf[100] = { 'a', 'b', 'c' };
int i = snprintf(buf, sizeof(buf), "%lc", L'\0');
ASSERT_EQ(i, 1);
ASSERT_EQ(buf[0], '\0');
ASSERT_EQ(buf[1], '\0');
ASSERT_EQ(buf[2], 'c');
buf[0] = 'a';
buf[1] = 'b';
i = snprintf(buf, sizeof(buf), "%hc", u'\0');
ASSERT_EQ(i, 1);
ASSERT_EQ(buf[0], '\0');
ASSERT_EQ(buf[1], '\0');
ASSERT_EQ(buf[2], 'c');
i = snprintf(buf, sizeof(buf), "%lc", L'þ');
ASSERT_EQ(i, 2);
ASSERT_STREQ(buf, "þ");
i = snprintf(buf, sizeof(buf), "%hc", u'þ');
ASSERT_EQ(i, 2);
ASSERT_STREQ(buf, "þ");
}

View file

@ -34,23 +34,33 @@ TEST(SUITE(sprintf), testStringLength) {
}
TEST(SUITE(sprintf), testCharacterCounting) {
ASSERT_STREQ(" ♥♦♣♠☺☻▲", Format(FORMAT("%16"), STRING("♥♦♣♠☺☻▲")));
ASSERT_STREQ(" ♥♦♣♠☺☻▲", Format(FORMAT("%30"), STRING("♥♦♣♠☺☻▲")));
ASSERT_STREQ("♥♦♣♠☺☻▲", Format(FORMAT("%16"), STRING("♥♦♣♠☺☻▲")));
}
TEST(SUITE(snprintf), testTableFlip) {
EXPECT_STREQ("Table flip ", Format("%-20ls", L"Table flip"));
EXPECT_STREQ("(╯°□°)╯︵L┻━┻ ", Format("%-20ls", L"(╯°□°)╯︵L┻━┻"));
EXPECT_STREQ("(╯°□°)╯︵u┻━┻ ", Format("%-20hs", u"(╯°□°)╯︵u┻━┻"));
EXPECT_STREQ("ちゃぶ台返し ", Format("%-20ls", L"ちゃぶ台返し"));
EXPECT_STREQ(" (╯°□°)╯︵L┻━┻", Format("%20ls", L"(╯°□°)╯︵L┻━┻"));
EXPECT_STREQ(" ちゃぶ台返し", Format("%20ls", L"ちゃぶ台返し"));
EXPECT_STREQ("(╯°□°)╯︵L┻━┻", Format("%-20ls", L"(╯°□°)╯︵L┻━┻"));
EXPECT_STREQ("(╯°□°)╯︵L┻━┻ ", Format("%-35ls", L"(╯°□°)╯︵L┻━┻"));
EXPECT_STREQ("(╯°□°)╯︵u┻━┻", Format("%-20hs", u"(╯°□°)╯︵u┻━┻"));
EXPECT_STREQ("(╯°□°)╯︵u┻━┻ ", Format("%-35hs", u"(╯°□°)╯︵u┻━┻"));
EXPECT_STREQ("ちゃぶ台返し ", Format("%-20ls", L"ちゃぶ台返し"));
EXPECT_STREQ("ちゃぶ台返し ", Format("%-26ls", L"ちゃぶ台返し"));
EXPECT_STREQ("(╯°□°)╯︵L┻━┻", Format("%20ls", L"(╯°□°)╯︵L┻━┻"));
EXPECT_STREQ(" (╯°□°)╯︵L┻━┻", Format("%35ls", L"(╯°□°)╯︵L┻━┻"));
EXPECT_STREQ(" ちゃぶ台返し", Format("%20ls", L"ちゃぶ台返し"));
EXPECT_STREQ(" ちゃぶ台返し", Format("%26ls", L"ちゃぶ台返し"));
}
TEST(SUITE(snprintf), testCombiningWidth) {
EXPECT_STREQ("H̲E̲L̲L̲O̲ ",
EXPECT_STREQ("H̲E̲L̲L̲O̲",
Format("%-10ls", L"H\u0332E\u0332L\u0332L\u0332O\u0332"));
EXPECT_STREQ(" H̲E̲L̲L̲O̲",
EXPECT_STREQ("H̲E̲L̲L̲O̲ ",
Format("%-20ls", L"H\u0332E\u0332L\u0332L\u0332O\u0332"));
EXPECT_STREQ("H̲E̲L̲L̲O̲",
Format("%10hs", u"H\u0332E\u0332L\u0332L\u0332O\u0332"));
EXPECT_STREQ(" H̲E̲L̲L̲O̲",
Format("%20hs", u"H\u0332E\u0332L\u0332L\u0332O\u0332"));
}
TEST(SUITE(snprintf), testQuoting) {