From a779fc78858ca8f61170d1016368ece21bb95b93 Mon Sep 17 00:00:00 2001
From: Gabriel Ravier <gabravier@gmail.com>
Date: Sun, 1 Sep 2024 00:27:03 +0200
Subject: [PATCH] Fix %[hl]s field length/precision on printf funcs

Cosmopolitan currently has a completely wrong understanding (according
to the C Standard) of how to handle precision and field length for
wide strings, which have to be handled not by using the
monospace display width but instead by using the outputted byte
width (i.e. the UTF-8 length of the wide string).

This patch corrects this, corrects comments documenting the previous
behavior, adds tests for the behavior and refactors tests that tested
for the previous behavior to correctly use the new correct behavior.
---
 libc/stdio/fmt.c                | 125 ++++++++++++++++++++++++++------
 libc/stdio/printf.c             |   8 +-
 test/libc/stdio/snprintf_test.c |  78 ++++++++++++++++++++
 test/libc/stdio/sprintf_s.inc   |  26 +++++--
 4 files changed, 203 insertions(+), 34 deletions(-)

diff --git a/libc/stdio/fmt.c b/libc/stdio/fmt.c
index ee558e6f0..4af7c88ce 100644
--- a/libc/stdio/fmt.c
+++ b/libc/stdio/fmt.c
@@ -365,19 +365,24 @@ static int __fmt_stoa_byte(out_f out, void *a, uint64_t c) {
   return out(buf, a, 1);
 }
 
+static size_t __fmt_stoa_get_utf8_size(uint64_t w)
+{
+  return w ? (bsr(w) >> 3) + 1 : 1;
+}
+
 static int __fmt_stoa_wide(out_f out, void *a, uint64_t w) {
   char buf[8];
   if (!isascii(w))
     w = tpenc(w);
   WRITE64LE(buf, w);
-  return out(buf, a, w ? (bsr(w) >> 3) + 1 : 1);
+  return out(buf, a, __fmt_stoa_get_utf8_size(w));
 }
 
 static int __fmt_stoa_bing(out_f out, void *a, uint64_t w) {
   char buf[8];
   w = tpenc(kCp437[w & 0xFF]);
   WRITE64LE(buf, w);
-  return out(buf, a, w ? (bsr(w) >> 3) + 1 : 1);
+  return out(buf, a, __fmt_stoa_get_utf8_size(w));
 }
 
 static int __fmt_stoa_quoted(out_f out, void *a, uint64_t w) {
@@ -388,7 +393,7 @@ static int __fmt_stoa_quoted(out_f out, void *a, uint64_t w) {
     w = tpenc(w);
   }
   WRITE64LE(buf, w);
-  return out(buf, a, w ? (bsr(w) >> 3) + 1 : 1);
+  return out(buf, a, __fmt_stoa_get_utf8_size(w));
 }
 
 /**
@@ -410,6 +415,7 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
   char *p, buf[1];
   unsigned w, pad;
   bool justdobytes, ignorenul;
+  size_t i, next_char_outputted_byte_length, current_outputted_byte_length;
 
   p = data;
   if (!p) {
@@ -442,13 +448,93 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
     emit = __fmt_stoa_byte;
   }
 
+  /*
+    The standard specifies precision and field length in terms of bytes, not
+    characters, despite the confusing mention of "characters" when defining
+    field length (in the C standard, "character" usually actually means
+    "byte").
+
+    This is made clear in the standard by this example:
+    > EXAMPLE 2 - In this example, multibyte characters do not have a
+    > state-dependent encoding, and the members of the extended character set
+    > that consist of more than one byte each consist of exactly two bytes, the
+    > first of which is denoted here by a □ and the second by an uppercase
+    > letter.
+    >
+    > Given the following wide string with length seven,
+    >
+    >     static wchar_t wstr[] = L"□X□Yabc□Z□W";
+    >
+    > the seven calls
+    >
+    >     fprintf(stdout, "|1234567890123|\n");
+    >     fprintf(stdout, "|%13ls|\n", wstr);
+    >     fprintf(stdout, "|%-13.9ls|\n", wstr);
+    >     fprintf(stdout, "|%13.10ls|\n", wstr);
+    >     fprintf(stdout, "|%13.11ls|\n", wstr);
+    >     fprintf(stdout, "|%13.15ls|\n", &wstr[2]);
+    >     fprintf(stdout, "|%13lc|\n", (wint_t) wstr[5]);
+    >
+    > will print the following seven lines:
+    >
+    >     |1234567890123|
+    >     |  □X□Yabc□Z□W|
+    >     |□X□Yabc□Z    |
+    >     |    □X□Yabc□Z|
+    >     |  □X□Yabc□Z□W|
+    >     |      abc□Z□W|
+    >     |           □Z|
+    - C Standard, 7.23.5.6.18, The fprintf function
+
+    The example explicitly shows that precision and field width are interpreted
+    as output byte length.
+
+    Thus, to determine how many wide characters we must output, we need to
+    calculate how many bytes we are going to output.
+
+    Note that the standard also states that "In no case is a partial multibyte
+    character written", when specifying the behavior of wide string conversions,
+    meaning we need to make sure that we stop before any character that would
+    result in more bytes being outputted than the precision specified is reached
+    (this is not the case for non-wide string conversions, meaning those may
+    write partial multibyte characters just fine)
+  */
   if (!(flags & FLAGS_PRECISION))
     precision = -1;
   if (!(flags & FLAGS_PRECISION) || !ignorenul) {
+    current_outputted_byte_length = 0;
     if (signbit == 63) {
-      precision = wcsnlen((const wchar_t *)p, precision);
+      for (i = 0; i < precision; ++i) {
+        wc = ((const wchar_t *)p)[i];
+        if (wc == L'\0')
+          break;
+        next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
+        if (current_outputted_byte_length + next_char_outputted_byte_length >
+            precision)
+          break;
+        current_outputted_byte_length += next_char_outputted_byte_length;
+      }
+      precision = current_outputted_byte_length;
     } else if (signbit == 15) {
-      precision = strnlen16((const char16_t *)p, precision);
+      for (i = 0; i < precision; ++i) {
+        wc = ((const char16_t *)p)[i];
+        if (wc == 0)
+          break;
+        if (IsUcs2(wc))
+          next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
+        else if (IsUtf16Cont(wc)) {
+          if (i + 1 < precision && ((const char16_t *)p)[i + 1] != 0)
+            wc = MergeUtf16(wc, ((const char16_t *)p)[i + 1]);
+          next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
+        } else {
+          next_char_outputted_byte_length = 0;
+        }
+        if (current_outputted_byte_length + next_char_outputted_byte_length >
+            precision)
+          break;
+        current_outputted_byte_length += next_char_outputted_byte_length;
+      }
+      precision = current_outputted_byte_length;
     } else {
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstringop-overread"
@@ -460,13 +546,6 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
   pad = 0;
   if (width) {
     w = precision;
-    if (signbit == 63) {
-      w = wcsnwidth((const wchar_t *)p, precision, 0);
-    } else if (signbit == 15) {
-      w = strnwidth16((const char16_t *)p, precision, 0);
-    } else {
-      w = strnwidth(p, precision, 0);
-    }
     if (!(flags & FLAGS_NOQUOTE) && (flags & FLAGS_REPR)) {
       w += 2 + (signbit == 63) + (signbit == 15);
     }
@@ -502,20 +581,16 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
         return -1;
     }
   } else {
-    while (precision--) {
+    while (precision) {
       if (signbit == 15) {
         wc = *(const char16_t *)p;
         if (!wc && !ignorenul)
           break;
         if (IsUcs2(wc)) {
           p += sizeof(char16_t);
-        } else if (IsUtf16Cont(wc)) {
+        } else if (IsUtf16Cont(wc) &&
+                   precision > __fmt_stoa_get_utf8_size(tpenc(wc))) {
           p += sizeof(char16_t);
-          continue;
-        } else if (!precision) {
-          break;
-        } else {
-          --precision;
           wc = MergeUtf16(wc, *(const char16_t *)p);
         }
       } else if (signbit == 63) {
@@ -536,7 +611,6 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
           wc = ThomPikeByte(wc);
           if (n > precision)
             break;
-          precision -= n;
           while (n--) {
             wc = ThomPikeMerge(wc, *p++);
           }
@@ -544,6 +618,11 @@ static int __fmt_stoa(int out(const char *, void *, size_t), void *arg,
       }
       if (emit(out, arg, wc) == -1)
         return -1;
+      next_char_outputted_byte_length = __fmt_stoa_get_utf8_size(tpenc(wc));
+      if (next_char_outputted_byte_length > precision)
+        precision = 0;
+      else
+        precision -= next_char_outputted_byte_length;
     }
   }
 
@@ -792,8 +871,8 @@ static int __fmt_noop(const char *, void *, size_t) {
  *   - `%.*s`    supplied byte length argument (obeys nul terminator)
  *   - ``%`.*s`` supplied byte length argument c escaped (ignores nul term)
  *   - `%#.*s`   supplied byte length argument visualized (ignores nul term)
- *   - `%.*hs`   supplied char16_t length argument (obeys nul terminator)
- *   - `%.*ls`   supplied wchar_t length argument (obeys nul terminator)
+ *   - `%.*hs`   supplied output byte length argument (obeys nul terminator)
+ *   - `%.*ls`   supplied output byte length argument (obeys nul terminator)
  *
  * Formatting Modifiers
  *
@@ -1078,7 +1157,7 @@ int __fmt(void *fn, void *arg, const char *format, va_list va, int *wrote) {
           p = charbuf;
           qchar = '\'';
           flags |= FLAGS_PRECISION;
-          prec = 1;
+          prec = __fmt_stoa_get_utf8_size(tpenc(charbuf[0]));
           goto FormatString;
         } else if (flags & (FLAGS_QUOTE | FLAGS_REPR)) {
           p = "'\\0'";
diff --git a/libc/stdio/printf.c b/libc/stdio/printf.c
index a6898176d..afbc7827b 100644
--- a/libc/stdio/printf.c
+++ b/libc/stdio/printf.c
@@ -33,11 +33,13 @@
  *   consistent with glibc, musl, and uclibc.
  *
  * - `%hs` converts UTF-16/UCS-2 → UTF-8, which can be helpful on Windows.
- *   Formatting (e.g. %-10hs) will use monospace display width rather
- *   than string length or codepoint count.
+ *   Formatting (e.g. %-10hs) will use output byte length (rounded down to
+ *   ensure only full multibyte characters are output) rather than string
+ *   length or codepoint count.
  *
  * - `%ls` (or `%Ls`) converts UTF-32 → UTF-8. Formatting (e.g. %-10ls)
- *   will use monospace display width rather than string length.
+ *   will use output byte length (rounded down to ensure only full multibyte
+ *   characters are output) rather than string length.
  *
  * - The `%#s` and `%#c` alternate forms display values using the
  *   standard IBM standard 256-letter alphabet. Using `%#.*s` to specify
diff --git a/test/libc/stdio/snprintf_test.c b/test/libc/stdio/snprintf_test.c
index d0428eaa1..329f7af1e 100644
--- a/test/libc/stdio/snprintf_test.c
+++ b/test/libc/stdio/snprintf_test.c
@@ -35,3 +35,81 @@ TEST(snprintf, testPlusFlagOnChar) {
   ASSERT_EQ(i, 1);
   ASSERT_STREQ(buf, "=");
 }
+
+TEST(snprintf, testStringWidth) {
+  char buf[100] = {};
+  int i = snprintf(buf, sizeof(buf), "<%9ls>", L"éée");
+
+  ASSERT_EQ(i, 11);
+  ASSERT_STREQ(buf, "<    éée>");
+
+  i = snprintf(buf, sizeof(buf), "<%9s>", "éée");
+  ASSERT_EQ(i, 11);
+  ASSERT_STREQ(buf, "<    éée>");
+
+  i = snprintf(buf, sizeof(buf), "<%9hs>", u"éée");
+  ASSERT_EQ(i, 11);
+  ASSERT_STREQ(buf, "<    éée>");
+}
+
+TEST(snprintf, testStringPrecision) {
+  char buf[100] = {};
+  int i = snprintf(buf, sizeof(buf), "%.4ls", L"eeée");
+
+  ASSERT_EQ(i, 4);
+  ASSERT_STREQ(buf, "eeé");
+
+  i = snprintf(buf, sizeof(buf), "%.4s", "eeée");
+  ASSERT_EQ(i, 4);
+  ASSERT_STREQ(buf, "eeé");
+
+  i = snprintf(buf, sizeof(buf), "%.4hs", u"eeée");
+  ASSERT_EQ(i, 4);
+  ASSERT_STREQ(buf, "eeé");
+}
+
+TEST(snprintf, testStringPrecisionPartialChar) {
+  char buf[100] = {};
+  int i = snprintf(buf, sizeof(buf), "%.4ls", L"eéée");
+
+  ASSERT_EQ(i, 3);
+  ASSERT_STREQ(buf, "eé");
+
+  // Note that partial multibyte characters are fine on non-wide conversions
+  // (whereas they are not written on wide conversions,
+  // according to the standard)
+  i = snprintf(buf, sizeof(buf), "%.4s", "eéée");
+  ASSERT_EQ(i, 4);
+  ASSERT_STREQ(buf, "eé\xc3");
+
+  i = snprintf(buf, sizeof(buf), "%.4hs", u"eéée");
+  ASSERT_EQ(i, 3);
+  ASSERT_STREQ(buf, "eé");
+}
+
+TEST(snprintf, testWideChar) {
+  char buf[100] = { 'a', 'b', 'c' };
+  int i = snprintf(buf, sizeof(buf), "%lc", L'\0');
+
+  ASSERT_EQ(i, 1);
+  ASSERT_EQ(buf[0], '\0');
+  ASSERT_EQ(buf[1], '\0');
+  ASSERT_EQ(buf[2], 'c');
+
+  buf[0] = 'a';
+  buf[1] = 'b';
+  i = snprintf(buf, sizeof(buf), "%hc", u'\0');
+
+  ASSERT_EQ(i, 1);
+  ASSERT_EQ(buf[0], '\0');
+  ASSERT_EQ(buf[1], '\0');
+  ASSERT_EQ(buf[2], 'c');
+
+  i = snprintf(buf, sizeof(buf), "%lc", L'þ');
+  ASSERT_EQ(i, 2);
+  ASSERT_STREQ(buf, "þ");
+
+  i = snprintf(buf, sizeof(buf), "%hc", u'þ');
+  ASSERT_EQ(i, 2);
+  ASSERT_STREQ(buf, "þ");
+}
diff --git a/test/libc/stdio/sprintf_s.inc b/test/libc/stdio/sprintf_s.inc
index 98c39cf88..661eaee27 100644
--- a/test/libc/stdio/sprintf_s.inc
+++ b/test/libc/stdio/sprintf_s.inc
@@ -34,23 +34,33 @@ TEST(SUITE(sprintf), testStringLength) {
 }
 
 TEST(SUITE(sprintf), testCharacterCounting) {
-  ASSERT_STREQ("         ♥♦♣♠☺☻▲", Format(FORMAT("%16"), STRING("♥♦♣♠☺☻▲")));
+  ASSERT_STREQ("         ♥♦♣♠☺☻▲", Format(FORMAT("%30"), STRING("♥♦♣♠☺☻▲")));
+  ASSERT_STREQ("♥♦♣♠☺☻▲", Format(FORMAT("%16"), STRING("♥♦♣♠☺☻▲")));
 }
 
 TEST(SUITE(snprintf), testTableFlip) {
   EXPECT_STREQ("Table flip          ", Format("%-20ls", L"Table flip"));
-  EXPECT_STREQ("(╯°□°)╯︵L┻━┻       ", Format("%-20ls", L"(╯°□°)╯︵L┻━┻"));
-  EXPECT_STREQ("(╯°□°)╯︵u┻━┻       ", Format("%-20hs", u"(╯°□°)╯︵u┻━┻"));
-  EXPECT_STREQ("ちゃぶ台返し        ", Format("%-20ls", L"ちゃぶ台返し"));
-  EXPECT_STREQ("       (╯°□°)╯︵L┻━┻", Format("%20ls", L"(╯°□°)╯︵L┻━┻"));
-  EXPECT_STREQ("        ちゃぶ台返し", Format("%20ls", L"ちゃぶ台返し"));
+  EXPECT_STREQ("(╯°□°)╯︵L┻━┻", Format("%-20ls", L"(╯°□°)╯︵L┻━┻"));
+  EXPECT_STREQ("(╯°□°)╯︵L┻━┻       ", Format("%-35ls", L"(╯°□°)╯︵L┻━┻"));
+  EXPECT_STREQ("(╯°□°)╯︵u┻━┻", Format("%-20hs", u"(╯°□°)╯︵u┻━┻"));
+  EXPECT_STREQ("(╯°□°)╯︵u┻━┻       ", Format("%-35hs", u"(╯°□°)╯︵u┻━┻"));
+  EXPECT_STREQ("ちゃぶ台返し  ", Format("%-20ls", L"ちゃぶ台返し"));
+  EXPECT_STREQ("ちゃぶ台返し        ", Format("%-26ls", L"ちゃぶ台返し"));
+  EXPECT_STREQ("(╯°□°)╯︵L┻━┻", Format("%20ls", L"(╯°□°)╯︵L┻━┻"));
+  EXPECT_STREQ("       (╯°□°)╯︵L┻━┻", Format("%35ls", L"(╯°□°)╯︵L┻━┻"));
+  EXPECT_STREQ("  ちゃぶ台返し", Format("%20ls", L"ちゃぶ台返し"));
+  EXPECT_STREQ("        ちゃぶ台返し", Format("%26ls", L"ちゃぶ台返し"));
 }
 
 TEST(SUITE(snprintf), testCombiningWidth) {
-  EXPECT_STREQ("H̲E̲L̲L̲O̲     ",
+  EXPECT_STREQ("H̲E̲L̲L̲O̲",
                Format("%-10ls", L"H\u0332E\u0332L\u0332L\u0332O\u0332"));
-  EXPECT_STREQ("     H̲E̲L̲L̲O̲",
+  EXPECT_STREQ("H̲E̲L̲L̲O̲     ",
+               Format("%-20ls", L"H\u0332E\u0332L\u0332L\u0332O\u0332"));
+  EXPECT_STREQ("H̲E̲L̲L̲O̲",
                Format("%10hs", u"H\u0332E\u0332L\u0332L\u0332O\u0332"));
+  EXPECT_STREQ("     H̲E̲L̲L̲O̲",
+               Format("%20hs", u"H\u0332E\u0332L\u0332L\u0332O\u0332"));
 }
 
 TEST(SUITE(snprintf), testQuoting) {