Fix %[hl]s field length/precision on printf funcs

Cosmopolitan currently has a completely wrong understanding (according to the C Standard) of how to handle precision and field length for wide strings, which have to be handled not by using the monospace display width but instead by using the outputted byte width (i.e. the UTF-8 length of the wide string). This patch corrects this, corrects comments documenting the previous behavior, adds tests for the behavior and refactors tests that tested for the previous behavior to correctly use the new correct behavior.
2025-08-03 08:20:28 +00:00 · 2024-09-01 00:27:03 +02:00 · 2024-09-01 00:27:03 +02:00 · a779fc7885
commit a779fc7885
parent 5b9862907c
4 changed files with 203 additions and 34 deletions
--- a/libc/stdio/fmt.c
+++ b/libc/stdio/fmt.c
@ -365,19 +365,24 @@ static int __fmt_stoa_byte(out_f out, void *a, uint64_t c) {
  return out(buf, a, 1);
 }

+static size_t __fmt_stoa_get_utf8_size(uint64_t w)
+{
+  return w ? (bsr(w) >> 3) + 1 : 1;
+}
+
 static int __fmt_stoa_wide(out_f out, void *a, uint64_t w) {
  char buf[8];
  if (!isascii(w))
    w = tpenc(w);
  WRITE64LE(buf, w);
-  return out(buf, a, w ? (bsr(w) >> 3) + 1 : 1);
+  return out(buf, a, __fmt_stoa_get_utf8_size(w));
 }

 static int __fmt_stoa_bing(out_f out, void *a, uint64_t w) {
  char buf[8];
  w = tpenc(kCp437[w & 0xFF]);
  WRITE64LE(buf, w);
-  return out(buf, a, w ? (bsr(w) >> 3) + 1 : 1);
+  return out(buf, a, __fmt_stoa_get_utf8_size(w));
 }

 static int __fmt_stoa_quoted(out_f out, void *a, uint64_t w) {
@ -388,7 +393,7 @@ static int __fmt_stoa_quoted(out_f out, void *a, uint64_t w) {
    w = tpenc(w);
  }
  WRITE64LE(buf, w);
-  return out(buf, a, w ? (bsr(w) >> 3) + 1 : 1);
+  return out(buf, a, __fmt_stoa_get_utf8_size(w));
 }

 /**
@ -410,6 +415,7 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
  char *p, buf[1];
  unsigned w, pad;
  bool justdobytes, ignorenul;
+  size_t i, next_char_outputted_byte_length, current_outputted_byte_length;

  p = data;
  if (!p) {
@ -442,13 +448,93 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
    emit = __fmt_stoa_byte;
  }

+  /*
+    The standard specifies precision and field length in terms of bytes, not
+    characters, despite the confusing mention of "characters" when defining
+    field length (in the C standard, "character" usually actually means
+    "byte").
+
+    This is made clear in the standard by this example:
+    > EXAMPLE 2 - In this example, multibyte characters do not have a
+    > state-dependent encoding, and the members of the extended character set
+    > that consist of more than one byte each consist of exactly two bytes, the
+    > first of which is denoted here by a □ and the second by an uppercase
+    > letter.
+    >
+    > Given the following wide string with length seven,
+    >
+    >     static wchar_t wstr[] = L"□X□Yabc□Z□W";
+    >
+    > the seven calls
+    >
+    >     fprintf(stdout, "|1234567890123|\n");
+    >     fprintf(stdout, "|%13ls|\n", wstr);
+    >     fprintf(stdout, "|%-13.9ls|\n", wstr);
+    >     fprintf(stdout, "|%13.10ls|\n", wstr);
+    >     fprintf(stdout, "|%13.11ls|\n", wstr);
+    >     fprintf(stdout, "|%13.15ls|\n", &wstr[2]);
+    >     fprintf(stdout, "|%13lc|\n", (wint_t) wstr[5]);
+    >
+    > will print the following seven lines:
+    >
+    >     |1234567890123|
+    >     |  □X□Yabc□Z□W|
+    >     |□X□Yabc□Z    |
+    >     |    □X□Yabc□Z|
+    >     |  □X□Yabc□Z□W|
+    >     |      abc□Z□W|
+    >     |           □Z|
+    - C Standard, 7.23.5.6.18, The fprintf function
+
+    The example explicitly shows that precision and field width are interpreted
+    as output byte length.
+
+    Thus, to determine how many wide characters we must output, we need to
+    calculate how many bytes we are going to output.
+
+    Note that the standard also states that "In no case is a partial multibyte
+    character written", when specifying the behavior of wide string conversions,
+    meaning we need to make sure that we stop before any character that would
+    result in more bytes being outputted than the precision specified is reached
+    (this is not the case for non-wide string conversions, meaning those may
+    write partial multibyte characters just fine)
+  */
  if (!(flags & FLAGS_PRECISION))
    precision = -1;
  if (!(flags & FLAGS_PRECISION) || !ignorenul) {
+    current_outputted_byte_length = 0;
    if (signbit == 63) {
-      precision = wcsnlen((const wchar_t *)p, precision);
+      for (i = 0; i < precision; ++i) {
+        wc = ((const wchar_t *)p)[i];
+        if (wc == L'\0')
+          break;
+        next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
+        if (current_outputted_byte_length + next_char_outputted_byte_length >
+            precision)
+          break;
+        current_outputted_byte_length += next_char_outputted_byte_length;
+      }
+      precision = current_outputted_byte_length;
    } else if (signbit == 15) {
-      precision = strnlen16((const char16_t *)p, precision);
+      for (i = 0; i < precision; ++i) {
+        wc = ((const char16_t *)p)[i];
+        if (wc == 0)
+          break;
+        if (IsUcs2(wc))
+          next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
+        else if (IsUtf16Cont(wc)) {
+          if (i + 1 < precision && ((const char16_t *)p)[i + 1] != 0)
+            wc = MergeUtf16(wc, ((const char16_t *)p)[i + 1]);
+          next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
+        } else {
+          next_char_outputted_byte_length = 0;
+        }
+        if (current_outputted_byte_length + next_char_outputted_byte_length >
+            precision)
+          break;
+        current_outputted_byte_length += next_char_outputted_byte_length;
+      }
+      precision = current_outputted_byte_length;
    } else {
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstringop-overread"
@ -460,13 +546,6 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
  pad = 0;
  if (width) {
    w = precision;
-    if (signbit == 63) {
-      w = wcsnwidth((const wchar_t *)p, precision, 0);
-    } else if (signbit == 15) {
-      w = strnwidth16((const char16_t *)p, precision, 0);
-    } else {
-      w = strnwidth(p, precision, 0);
-    }
    if (!(flags & FLAGS_NOQUOTE) && (flags & FLAGS_REPR)) {
      w += 2 + (signbit == 63) + (signbit == 15);
    }
@ -502,20 +581,16 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
        return -1;
    }
  } else {
-    while (precision--) {
+    while (precision) {
      if (signbit == 15) {
        wc = *(const char16_t *)p;
        if (!wc && !ignorenul)
          break;
        if (IsUcs2(wc)) {
          p += sizeof(char16_t);
-        } else if (IsUtf16Cont(wc)) {
+        } else if (IsUtf16Cont(wc) &&
+                   precision > __fmt_stoa_get_utf8_size(tpenc(wc))) {
          p += sizeof(char16_t);
-          continue;
-        } else if (!precision) {
-          break;
-        } else {
-          --precision;
          wc = MergeUtf16(wc, *(const char16_t *)p);
        }
      } else if (signbit == 63) {
@ -536,7 +611,6 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
          wc = ThomPikeByte(wc);
          if (n > precision)
            break;
-          precision -= n;
          while (n--) {
            wc = ThomPikeMerge(wc, *p++);
          }
@ -544,6 +618,11 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
      }
      if (emit(out, arg, wc) == -1)
        return -1;
+      next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
+      if (next_char_outputted_byte_length > precision)
+        precision = 0;
+      else
+        precision -= next_char_outputted_byte_length;
    }
  }

@ -792,8 +871,8 @@ static int __fmt_noop(const char *, void *, size_t) {
 *   - `%.*s`    supplied byte length argument (obeys nul terminator)
 *   - ``%`.*s`` supplied byte length argument c escaped (ignores nul term)
 *   - `%#.*s`   supplied byte length argument visualized (ignores nul term)
- *   - `%.*hs`   supplied char16_t length argument (obeys nul terminator)
- *   - `%.*ls`   supplied wchar_t length argument (obeys nul terminator)
+ *   - `%.*hs`   supplied output byte length argument (obeys nul terminator)
+ *   - `%.*ls`   supplied output byte length argument (obeys nul terminator)
 *
 * Formatting Modifiers
 *
@ -1078,7 +1157,7 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
          p = charbuf;
          qchar = '\'';
          flags |= FLAGS_PRECISION;
-          prec = 1;
+          prec = __fmt_stoa_get_utf8_size(tpenc(charbuf[0]));
          goto FormatString;
        } else if (flags & (FLAGS_QUOTE | FLAGS_REPR)) {
          p = "'\\0'";
--- a/libc/stdio/printf.c
+++ b/libc/stdio/printf.c
@ -33,11 +33,13 @@
 *   consistent with glibc, musl, and uclibc.
 *
 * - `%hs` converts UTF-16/UCS-2 → UTF-8, which can be helpful on Windows.
- *   Formatting (e.g. %-10hs) will use monospace display width rather
- *   than string length or codepoint count.
+ *   Formatting (e.g. %-10hs) will use output byte length (rounded down to
+ *   ensure only full multibyte characters are output) rather than string
+ *   length or codepoint count.
 *
 * - `%ls` (or `%Ls`) converts UTF-32 → UTF-8. Formatting (e.g. %-10ls)
- *   will use monospace display width rather than string length.
+ *   will use output byte length (rounded down to ensure only full multibyte
+ *   characters are output) rather than string length.
 *
 * - The `%#s` and `%#c` alternate forms display values using the
 *   standard IBM standard 256-letter alphabet. Using `%#.*s` to specify
--- a/test/libc/stdio/snprintf_test.c
+++ b/test/libc/stdio/snprintf_test.c
@ -35,3 +35,81 @@ TEST(snprintf, testPlusFlagOnChar) {
  ASSERT_EQ(i, 1);
  ASSERT_STREQ(buf, "=");
 }
+
+TEST(snprintf, testStringWidth) {
+  char buf[100] = {};
+  int i = snprintf(buf, sizeof(buf), "<%9ls>", L"éée");
+
+  ASSERT_EQ(i, 11);
+  ASSERT_STREQ(buf, "<    éée>");
+
+  i = snprintf(buf, sizeof(buf), "<%9s>", "éée");
+  ASSERT_EQ(i, 11);
+  ASSERT_STREQ(buf, "<    éée>");
+
+  i = snprintf(buf, sizeof(buf), "<%9hs>", u"éée");
+  ASSERT_EQ(i, 11);
+  ASSERT_STREQ(buf, "<    éée>");
+}
+
+TEST(snprintf, testStringPrecision) {
+  char buf[100] = {};
+  int i = snprintf(buf, sizeof(buf), "%.4ls", L"eeée");
+
+  ASSERT_EQ(i, 4);
+  ASSERT_STREQ(buf, "eeé");
+
+  i = snprintf(buf, sizeof(buf), "%.4s", "eeée");
+  ASSERT_EQ(i, 4);
+  ASSERT_STREQ(buf, "eeé");
+
+  i = snprintf(buf, sizeof(buf), "%.4hs", u"eeée");
+  ASSERT_EQ(i, 4);
+  ASSERT_STREQ(buf, "eeé");
+}
+
+TEST(snprintf, testStringPrecisionPartialChar) {
+  char buf[100] = {};
+  int i = snprintf(buf, sizeof(buf), "%.4ls", L"eéée");
+
+  ASSERT_EQ(i, 3);
+  ASSERT_STREQ(buf, "eé");
+
+  // Note that partial multibyte characters are fine on non-wide conversions
+  // (whereas they are not written on wide conversions,
+  // according to the standard)
+  i = snprintf(buf, sizeof(buf), "%.4s", "eéée");
+  ASSERT_EQ(i, 4);
+  ASSERT_STREQ(buf, "eé\xc3");
+
+  i = snprintf(buf, sizeof(buf), "%.4hs", u"eéée");
+  ASSERT_EQ(i, 3);
+  ASSERT_STREQ(buf, "eé");
+}
+
+TEST(snprintf, testWideChar) {
+  char buf[100] = { 'a', 'b', 'c' };
+  int i = snprintf(buf, sizeof(buf), "%lc", L'\0');
+
+  ASSERT_EQ(i, 1);
+  ASSERT_EQ(buf[0], '\0');
+  ASSERT_EQ(buf[1], '\0');
+  ASSERT_EQ(buf[2], 'c');
+
+  buf[0] = 'a';
+  buf[1] = 'b';
+  i = snprintf(buf, sizeof(buf), "%hc", u'\0');
+
+  ASSERT_EQ(i, 1);
+  ASSERT_EQ(buf[0], '\0');
+  ASSERT_EQ(buf[1], '\0');
+  ASSERT_EQ(buf[2], 'c');
+
+  i = snprintf(buf, sizeof(buf), "%lc", L'þ');
+  ASSERT_EQ(i, 2);
+  ASSERT_STREQ(buf, "þ");
+
+  i = snprintf(buf, sizeof(buf), "%hc", u'þ');
+  ASSERT_EQ(i, 2);
+  ASSERT_STREQ(buf, "þ");
+}
--- a/test/libc/stdio/sprintf_s.inc
+++ b/test/libc/stdio/sprintf_s.inc
@ -34,23 +34,33 @@ TEST(SUITE(sprintf), testStringLength) {
 }

 TEST(SUITE(sprintf), testCharacterCounting) {
-  ASSERT_STREQ("         ♥♦♣♠☺☻▲", Format(FORMAT("%16"), STRING("♥♦♣♠☺☻▲")));
+  ASSERT_STREQ("         ♥♦♣♠☺☻▲", Format(FORMAT("%30"), STRING("♥♦♣♠☺☻▲")));
+  ASSERT_STREQ("♥♦♣♠☺☻▲", Format(FORMAT("%16"), STRING("♥♦♣♠☺☻▲")));
 }

 TEST(SUITE(snprintf), testTableFlip) {
  EXPECT_STREQ("Table flip          ", Format("%-20ls", L"Table flip"));
-  EXPECT_STREQ("(╯°□°)╯︵L┻━┻       ", Format("%-20ls", L"(╯°□°)╯︵L┻━┻"));
-  EXPECT_STREQ("(╯°□°)╯︵u┻━┻       ", Format("%-20hs", u"(╯°□°)╯︵u┻━┻"));
-  EXPECT_STREQ("ちゃぶ台返し        ", Format("%-20ls", L"ちゃぶ台返し"));
-  EXPECT_STREQ("       (╯°□°)╯︵L┻━┻", Format("%20ls", L"(╯°□°)╯︵L┻━┻"));
-  EXPECT_STREQ("        ちゃぶ台返し", Format("%20ls", L"ちゃぶ台返し"));
+  EXPECT_STREQ("(╯°□°)╯︵L┻━┻", Format("%-20ls", L"(╯°□°)╯︵L┻━┻"));
+  EXPECT_STREQ("(╯°□°)╯︵L┻━┻       ", Format("%-35ls", L"(╯°□°)╯︵L┻━┻"));
+  EXPECT_STREQ("(╯°□°)╯︵u┻━┻", Format("%-20hs", u"(╯°□°)╯︵u┻━┻"));
+  EXPECT_STREQ("(╯°□°)╯︵u┻━┻       ", Format("%-35hs", u"(╯°□°)╯︵u┻━┻"));
+  EXPECT_STREQ("ちゃぶ台返し  ", Format("%-20ls", L"ちゃぶ台返し"));
+  EXPECT_STREQ("ちゃぶ台返し        ", Format("%-26ls", L"ちゃぶ台返し"));
+  EXPECT_STREQ("(╯°□°)╯︵L┻━┻", Format("%20ls", L"(╯°□°)╯︵L┻━┻"));
+  EXPECT_STREQ("       (╯°□°)╯︵L┻━┻", Format("%35ls", L"(╯°□°)╯︵L┻━┻"));
+  EXPECT_STREQ("  ちゃぶ台返し", Format("%20ls", L"ちゃぶ台返し"));
+  EXPECT_STREQ("        ちゃぶ台返し", Format("%26ls", L"ちゃぶ台返し"));
 }

 TEST(SUITE(snprintf), testCombiningWidth) {
-  EXPECT_STREQ("H̲E̲L̲L̲O̲     ",
+  EXPECT_STREQ("H̲E̲L̲L̲O̲",
               Format("%-10ls", L"H\u0332E\u0332L\u0332L\u0332O\u0332"));
-  EXPECT_STREQ("     H̲E̲L̲L̲O̲",
+  EXPECT_STREQ("H̲E̲L̲L̲O̲     ",
+               Format("%-20ls", L"H\u0332E\u0332L\u0332L\u0332O\u0332"));
+  EXPECT_STREQ("H̲E̲L̲L̲O̲",
               Format("%10hs", u"H\u0332E\u0332L\u0332L\u0332O\u0332"));
+  EXPECT_STREQ("     H̲E̲L̲L̲O̲",
+               Format("%20hs", u"H\u0332E\u0332L\u0332L\u0332O\u0332"));
 }

 TEST(SUITE(snprintf), testQuoting) {