Update Musl Libc code

We have now implemented all of Musl's localization code, the same way that Musl implements localization. You may need to call setlocale(LC_ALL, "C.UTF-8") in case anything stops working as expected.
2025-07-07 03:38:31 +00:00 · 2024-07-30 09:14:57 -07:00 · 2024-07-30 09:14:57 -07:00 · bb815eafaf
commit bb815eafaf
parent d0360bf4bd
116 changed files with 6525 additions and 5523 deletions
--- a/test/libc/str/BUILD.mk
+++ b/test/libc/str/BUILD.mk
@ -36,7 +36,6 @@ TEST_LIBC_STR_DIRECTDEPS =					\
 	LIBC_FMT						\
 	LIBC_INTRIN						\
 	LIBC_LOG						\
-	LIBC_TINYMATH						\
 	LIBC_MEM						\
 	LIBC_NEXGEN32E						\
 	LIBC_RUNTIME						\
@ -45,14 +44,16 @@ TEST_LIBC_STR_DIRECTDEPS =					\
 	LIBC_SYSV						\
 	LIBC_SYSV_CALLS						\
 	LIBC_TESTLIB						\
+	LIBC_TINYMATH						\
 	LIBC_X							\
 	THIRD_PARTY_COMPILER_RT					\
-	THIRD_PARTY_MBEDTLS					\
-	THIRD_PARTY_REGEX					\
-	THIRD_PARTY_ZLIB					\
 	THIRD_PARTY_LIBCXX					\
+	THIRD_PARTY_MBEDTLS					\
+	THIRD_PARTY_MUSL					\
+	THIRD_PARTY_REGEX					\
 	THIRD_PARTY_SMALLZ4					\
-	THIRD_PARTY_VQSORT
+	THIRD_PARTY_VQSORT					\
+	THIRD_PARTY_ZLIB					\

 TEST_LIBC_STR_DEPS :=						\
 	$(call uniq,$(foreach x,$(TEST_LIBC_STR_DIRECTDEPS),$($(x))))
--- a/test/libc/str/regex_test.c
+++ b/test/libc/str/regex_test.c
@ -19,10 +19,15 @@
 #include "third_party/regex/regex.h"
 #include "libc/mem/gc.h"
 #include "libc/mem/mem.h"
+#include "libc/str/locale.h"
 #include "libc/str/str.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"

+void SetUpOnce(void) {  // NOTE(review): presumably a testlib hook run before the tests - confirm
+  setlocale(LC_ALL, "C.UTF-8");  // select the UTF-8 locale for every category
+}
+
 TEST(regex, test) {
  regex_t rx;
  EXPECT_EQ(REG_OK, regcomp(&rx, "^[A-Za-z\x7f-\uffff]{2}$", REG_EXTENDED));
--- a/test/libc/str/setlocale_test.c
+++ b/test/libc/str/setlocale_test.c
@ -1,30 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2022 Gavin Arthur Hayes                                            │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/str/locale.h"
-#include "libc/testlib/testlib.h"
-
-TEST(setlocale, test) {
-  EXPECT_STREQ("C", setlocale(LC_ALL, NULL));
-  EXPECT_STREQ("C", setlocale(LC_ALL, "C"));
-  EXPECT_STREQ("C", setlocale(LC_ALL, NULL));
-  EXPECT_STREQ("POSIX", setlocale(LC_ALL, "POSIX"));
-  EXPECT_STREQ("C", setlocale(LC_ALL, ""));
-  EXPECT_EQ(0, setlocale(LC_ALL, "ja_JP.PCK"));
-  EXPECT_STREQ("C", setlocale(LC_ALL, NULL));
-}
--- a/test/libc/time/BUILD.mk
+++ b/test/libc/time/BUILD.mk
@ -28,7 +28,8 @@ TEST_LIBC_TIME_DIRECTDEPS =				\
 	LIBC_SYSV					\
 	LIBC_TESTLIB					\
 	LIBC_X						\
-	THIRD_PARTY_TZ
+	THIRD_PARTY_MUSL				\
+	THIRD_PARTY_TZ					\

 TEST_LIBC_TIME_DEPS :=					\
 	$(call uniq,$(foreach x,$(TEST_LIBC_TIME_DIRECTDEPS),$($(x))))
--- a/test/libc/tinymath/BUILD.mk
+++ b/test/libc/tinymath/BUILD.mk
@ -8,15 +8,16 @@ TEST_LIBC_TINYMATH_SRCS_CC := $(wildcard test/libc/tinymath/*.cc)
 TEST_LIBC_TINYMATH_SRCS_TEST = $(filter %_test.c,$(TEST_LIBC_TINYMATH_SRCS))

 TEST_LIBC_TINYMATH_SRCS =					\
-	$(TEST_LIBC_TINYMATH_SRCS_C:%.c=o/$(MODE)/%.o)		\
-	$(TEST_LIBC_TINYMATH_SRCS_CC:%.cc=o/$(MODE)/%.o)
+	$(TEST_LIBC_TINYMATH_SRCS_C)				\
+	$(TEST_LIBC_TINYMATH_SRCS_CC)

 TEST_LIBC_TINYMATH_OBJS =					\
 	$(TEST_LIBC_TINYMATH_SRCS_C:%.c=o/$(MODE)/%.o)		\
 	$(TEST_LIBC_TINYMATH_SRCS_CC:%.cc=o/$(MODE)/%.o)

 TEST_LIBC_TINYMATH_COMS =					\
-	$(TEST_LIBC_TINYMATH_SRCS:%.c=o/$(MODE)/%)
+	$(TEST_LIBC_TINYMATH_SRCS_C:%.c=o/$(MODE)/%)		\
+	$(TEST_LIBC_TINYMATH_SRCS_CC:%.cc=o/$(MODE)/%)

 TEST_LIBC_TINYMATH_BINS =					\
 	$(TEST_LIBC_TINYMATH_COMS)				\
@ -68,10 +69,6 @@ $(TEST_LIBC_TINYMATH_OBJS): private				\
 		CFLAGS +=					\
 			-fno-builtin

-$(TEST_LIBC_TINYMATH_OBJS): private				\
-		CXXFLAGS +=					\
-			#-ffast-math
-
 .PHONY: o/$(MODE)/test/libc/tinymath
 o/$(MODE)/test/libc/tinymath:					\
 		$(TEST_LIBC_TINYMATH_BINS)			\
--- a/test/posix/BUILD.mk
+++ b/test/posix/BUILD.mk
@ -35,7 +35,8 @@ TEST_POSIX_DIRECTDEPS =				\
 	LIBC_STDIO				\
 	LIBC_STR				\
 	LIBC_SYSV				\
-	LIBC_THREAD
+	LIBC_THREAD				\
+	THIRD_PARTY_MUSL			\

 TEST_POSIX_DEPS :=				\
 	$(call uniq,$(foreach x,$(TEST_POSIX_DIRECTDEPS),$($(x))))
--- a/test/posix/iconv_utf8_utf16_test.c
+++ b/test/posix/iconv_utf8_utf16_test.c
@ -0,0 +1,173 @@
+#include <errno.h>
+#include <iconv.h>
+#include <stdlib.h>
+#include <string.h>
+#include <uchar.h>
+
+#define INBUF_SIZE  1024
+#define OUTBUF_SIZE 2048
+
+int g_count;
+
+// Round-trips `input` through iconv: converts it from UTF-8 to UTF-16LE,
+// compares the produced bytes with `expected_output`, then converts the
+// result back to UTF-8 and compares it with the original `input`.
+//
+// @param input            UTF-8 bytes to convert (need not be NUL-terminated)
+// @param input_len        number of bytes in `input`; it is copied into a
+//                         local buffer of INBUF_SIZE bytes, so it must not
+//                         exceed INBUF_SIZE
+// @param expected_output  bytes the forward conversion must produce; only
+//                         read once the forward conversion has succeeded
+// @param expected_len     size of `expected_output` in bytes
+// @return 0 on success; otherwise a stage code (10, 20, 40, 50, ..., 130)
+//         plus the 1-based number of this call (g_count), which identifies
+//         the failing check and the test case that triggered it
+//
+// When iconv() itself fails (stage codes 20 and 90) the errno value it set
+// is restored after iconv_close(), so the caller can still inspect it.
+int check_conversion(const char* input, size_t input_len,
+                     const char16_t* expected_output, size_t expected_len) {
+  iconv_t cd;
+  char inbuf[INBUF_SIZE];
+  char outbuf[OUTBUF_SIZE];
+  char* inptr = inbuf;
+  char* outptr = outbuf;
+  size_t inbytesleft = input_len;
+  size_t outbytesleft = OUTBUF_SIZE;
+  size_t result;
+
+  ++g_count;
+
+  // iconv() takes a non-const input pointer, so work on a private copy.
+  memcpy(inbuf, input, input_len);
+
+  cd = iconv_open("UTF-16LE", "UTF-8");
+  if (cd == (iconv_t)-1) {
+    return 10 + g_count;  // iconv_open failed
+  }
+
+  result = iconv(cd, &inptr, &inbytesleft, &outptr, &outbytesleft);
+  if (result == (size_t)-1) {
+    int saved_errno = errno;  // iconv_close() is allowed to overwrite errno
+    iconv_close(cd);
+    errno = saved_errno;
+    return 20 + g_count;  // iconv failed; errno holds the reason
+  }
+
+  if (inbytesleft != 0) {
+    iconv_close(cd);
+    return 40 + g_count;  // Not all input was converted
+  }
+
+  size_t output_len = OUTBUF_SIZE - outbytesleft;
+  if (output_len != expected_len) {
+    iconv_close(cd);
+    return 50 + g_count;  // Output length mismatch
+  }
+
+  if (memcmp(outbuf, expected_output, output_len) != 0) {
+    iconv_close(cd);
+    return 60 + g_count;  // Output content mismatch
+  }
+
+  if (iconv_close(cd) == -1)
+    return 70 + g_count;  // iconv_close failed
+
+  // Reverse direction check: UTF-16LE back to UTF-8
+  cd = iconv_open("UTF-8", "UTF-16LE");
+  if (cd == (iconv_t)-1) {
+    return 80 + g_count;  // iconv_open failed for reverse direction
+  }
+
+  char reverse_inbuf[OUTBUF_SIZE];
+  char reverse_outbuf[INBUF_SIZE];
+  char* reverse_inptr = reverse_inbuf;
+  char* reverse_outptr = reverse_outbuf;
+  size_t reverse_inbytesleft = output_len;
+  size_t reverse_outbytesleft = INBUF_SIZE;
+
+  memcpy(reverse_inbuf, outbuf, output_len);
+
+  result = iconv(cd, &reverse_inptr, &reverse_inbytesleft, &reverse_outptr,
+                 &reverse_outbytesleft);
+  if (result == (size_t)-1) {
+    int saved_errno = errno;  // iconv_close() is allowed to overwrite errno
+    iconv_close(cd);
+    errno = saved_errno;
+    return 90 + g_count;  // iconv failed for reverse direction
+  }
+
+  if (reverse_inbytesleft != 0) {
+    iconv_close(cd);
+    return 100 + g_count;  // Not all input was converted in reverse direction
+  }
+
+  size_t reverse_output_len = INBUF_SIZE - reverse_outbytesleft;
+  if (reverse_output_len != input_len) {
+    iconv_close(cd);
+    return 110 + g_count;  // Reverse output length mismatch
+  }
+
+  if (memcmp(reverse_outbuf, input, input_len) != 0) {
+    iconv_close(cd);
+    return 120 + g_count;  // Reverse output content mismatch
+  }
+
+  if (iconv_close(cd) == -1)
+    return 130 + g_count;  // iconv_close failed for reverse direction
+
+  return 0;  // Success
+}
+
+// Runs eight UTF-8 <-> UTF-16LE round-trip cases through check_conversion().
+//
+// @return 0 when every case behaves as expected; otherwise the non-zero code
+//         returned by check_conversion() for the first failing case, or
+//         200/201 when the invalid-input case (test case 6) misbehaves
+int main(void) {
+  // Test case 1: Basic ASCII
+  const char input1[] = "Hello, world!";
+  const char16_t expected1[] = u"Hello, world!";
+  int result = check_conversion(input1, sizeof(input1) - 1, expected1,
+                                sizeof(expected1) - 2);
+  if (result != 0)
+    return result;
+
+  // Test case 2: Non-ASCII characters and newline
+  const char input2[] = "こんにちは\nWorld! ☺";
+  const char16_t expected2[] = u"こんにちは\nWorld! ☺";
+  result = check_conversion(input2, sizeof(input2) - 1, expected2,
+                            sizeof(expected2) - 2);
+  if (result != 0)
+    return result;
+
+  // Test case 3: Empty string
+  const char input3[] = "";
+  const char16_t expected3[] = u"";
+  result = check_conversion(input3, 0, expected3, 0);
+  if (result != 0)
+    return result;
+
+  // Test case 4: String with null characters
+  const char input4[] = "Hello\0World";
+  const char16_t expected4[] = u"Hello\0World";
+  result = check_conversion(input4, sizeof(input4) - 1, expected4,
+                            sizeof(expected4) - 2);
+  if (result != 0)
+    return result;
+
+  // Test case 5: Long string to test buffer handling
+  char input5[INBUF_SIZE];
+  char16_t expected5[INBUF_SIZE];
+  memset(input5, 'A', INBUF_SIZE - 1);
+  input5[INBUF_SIZE - 1] = '\0';
+  for (int i = 0; i < INBUF_SIZE - 1; i++) {
+    expected5[i] = u'A';
+  }
+  result =
+      check_conversion(input5, INBUF_SIZE - 1, expected5, (INBUF_SIZE - 1) * 2);
+  if (result != 0)
+    return result;
+
+  // Test case 6: Invalid UTF-8 sequence (overlong encoding of U+0000).
+  // The explicit casts avoid an out-of-range int -> char initializer where
+  // char is signed.
+  const char input6[] = {(char)0xC0, (char)0x80};
+  errno = 0;
+  result = check_conversion(input6, sizeof(input6), NULL, 0);
+  // 26 == 20 + 6: iconv() itself must fail on this sixth call ...
+  if (result != 26)
+    return 200;
+  // ... and it must report an illegal byte sequence. errno is read right
+  // after the call; only iconv_close() runs between the failure and here.
+  if (errno != EILSEQ)
+    return 201;
+
+  // Test case 7: Mixing ASCII and non-ASCII
+  const char input7[] = "Hello, 世界!";
+  const char16_t expected7[] = u"Hello, 世界!";
+  result = check_conversion(input7, sizeof(input7) - 1, expected7,
+                            sizeof(expected7) - 2);
+  if (result != 0)
+    return result;
+
+  // Test case 8: Surrogate pairs
+  const char input8[] = "𐐷";  // U+10437
+  const char16_t expected8[] =
+      u"𐐷";  // This will be encoded as a surrogate pair
+  result = check_conversion(input8, sizeof(input8) - 1, expected8,
+                            sizeof(expected8) - 2);
+  if (result != 0)
+    return result;
+
+  return 0;  // All tests passed
+}
--- a/test/posix/iconv_utf8_utf32_test.c
+++ b/test/posix/iconv_utf8_utf32_test.c
@ -0,0 +1,172 @@
+#include <errno.h>
+#include <iconv.h>
+#include <stdlib.h>
+#include <string.h>
+#include <uchar.h>
+
+#define INBUF_SIZE  1024
+#define OUTBUF_SIZE 4096
+
+int g_count;
+
+// Round-trips `input` through iconv: converts it from UTF-8 to UTF-32LE,
+// compares the produced bytes with `expected_output`, then converts the
+// result back to UTF-8 and compares it with the original `input`.
+//
+// @param input            UTF-8 bytes to convert (need not be NUL-terminated)
+// @param input_len        number of bytes in `input`; it is copied into a
+//                         local buffer of INBUF_SIZE bytes, so it must not
+//                         exceed INBUF_SIZE
+// @param expected_output  bytes the forward conversion must produce; only
+//                         read once the forward conversion has succeeded
+// @param expected_len     size of `expected_output` in bytes
+// @return 0 on success; otherwise a stage code (10, 20, 40, 50, ..., 130)
+//         plus the 1-based number of this call (g_count), which identifies
+//         the failing check and the test case that triggered it
+//
+// When iconv() itself fails (stage codes 20 and 90) the errno value it set
+// is restored after iconv_close(), so the caller can still inspect it.
+int check_conversion(const char* input, size_t input_len,
+                     const wchar_t* expected_output, size_t expected_len) {
+  iconv_t cd;
+  char inbuf[INBUF_SIZE];
+  char outbuf[OUTBUF_SIZE];
+  char* inptr = inbuf;
+  char* outptr = outbuf;
+  size_t inbytesleft = input_len;
+  size_t outbytesleft = OUTBUF_SIZE;
+  size_t result;
+
+  ++g_count;
+
+  // iconv() takes a non-const input pointer, so work on a private copy.
+  memcpy(inbuf, input, input_len);
+
+  cd = iconv_open("UTF-32LE", "UTF-8");
+  if (cd == (iconv_t)-1) {
+    return 10 + g_count;  // iconv_open failed
+  }
+
+  result = iconv(cd, &inptr, &inbytesleft, &outptr, &outbytesleft);
+  if (result == (size_t)-1) {
+    int saved_errno = errno;  // iconv_close() is allowed to overwrite errno
+    iconv_close(cd);
+    errno = saved_errno;
+    return 20 + g_count;  // iconv failed; errno holds the reason
+  }
+
+  if (inbytesleft != 0) {
+    iconv_close(cd);
+    return 40 + g_count;  // Not all input was converted
+  }
+
+  size_t output_len = OUTBUF_SIZE - outbytesleft;
+  if (output_len != expected_len) {
+    iconv_close(cd);
+    return 50 + g_count;  // Output length mismatch
+  }
+
+  if (memcmp(outbuf, expected_output, output_len) != 0) {
+    iconv_close(cd);
+    return 60 + g_count;  // Output content mismatch
+  }
+
+  if (iconv_close(cd) == -1)
+    return 70 + g_count;  // iconv_close failed
+
+  // Reverse direction check: UTF-32LE back to UTF-8
+  cd = iconv_open("UTF-8", "UTF-32LE");
+  if (cd == (iconv_t)-1) {
+    return 80 + g_count;  // iconv_open failed for reverse direction
+  }
+
+  char reverse_inbuf[OUTBUF_SIZE];
+  char reverse_outbuf[INBUF_SIZE];
+  char* reverse_inptr = reverse_inbuf;
+  char* reverse_outptr = reverse_outbuf;
+  size_t reverse_inbytesleft = output_len;
+  size_t reverse_outbytesleft = INBUF_SIZE;
+
+  memcpy(reverse_inbuf, outbuf, output_len);
+
+  result = iconv(cd, &reverse_inptr, &reverse_inbytesleft, &reverse_outptr,
+                 &reverse_outbytesleft);
+  if (result == (size_t)-1) {
+    int saved_errno = errno;  // iconv_close() is allowed to overwrite errno
+    iconv_close(cd);
+    errno = saved_errno;
+    return 90 + g_count;  // iconv failed for reverse direction
+  }
+
+  if (reverse_inbytesleft != 0) {
+    iconv_close(cd);
+    return 100 + g_count;  // Not all input was converted in reverse direction
+  }
+
+  size_t reverse_output_len = INBUF_SIZE - reverse_outbytesleft;
+  if (reverse_output_len != input_len) {
+    iconv_close(cd);
+    return 110 + g_count;  // Reverse output length mismatch
+  }
+
+  if (memcmp(reverse_outbuf, input, input_len) != 0) {
+    iconv_close(cd);
+    return 120 + g_count;  // Reverse output content mismatch
+  }
+
+  if (iconv_close(cd) == -1)
+    return 130 + g_count;  // iconv_close failed for reverse direction
+
+  return 0;  // Success
+}
+
+// Runs eight UTF-8 <-> UTF-32LE round-trip cases through check_conversion().
+//
+// The expected outputs are wide string literals that get compared bytewise
+// with iconv's UTF-32LE output, so wchar_t has to be exactly four bytes wide.
+//
+// @return 0 when every case behaves as expected; otherwise the non-zero code
+//         returned by check_conversion() for the first failing case, or
+//         200/201 when the invalid-input case (test case 6) misbehaves
+int main(void) {
+  _Static_assert(sizeof(wchar_t) == 4,
+                 "expected outputs assume a 32-bit wchar_t");
+
+  // Test case 1: Basic ASCII
+  const char input1[] = "Hello, world!";
+  const wchar_t expected1[] = L"Hello, world!";
+  int result = check_conversion(input1, sizeof(input1) - 1, expected1,
+                                sizeof(expected1) - sizeof(wchar_t));
+  if (result != 0)
+    return result;
+
+  // Test case 2: Non-ASCII characters and newline
+  const char input2[] = "こんにちは\nWorld! ☺";
+  const wchar_t expected2[] = L"こんにちは\nWorld! ☺";
+  result = check_conversion(input2, sizeof(input2) - 1, expected2,
+                            sizeof(expected2) - sizeof(wchar_t));
+  if (result != 0)
+    return result;
+
+  // Test case 3: Empty string
+  const char input3[] = "";
+  const wchar_t expected3[] = L"";
+  result = check_conversion(input3, 0, expected3, 0);
+  if (result != 0)
+    return result;
+
+  // Test case 4: String with null characters
+  const char input4[] = "Hello\0World";
+  const wchar_t expected4[] = L"Hello\0World";
+  result = check_conversion(input4, sizeof(input4) - 1, expected4,
+                            sizeof(expected4) - sizeof(wchar_t));
+  if (result != 0)
+    return result;
+
+  // Test case 5: Long string to test buffer handling
+  char input5[INBUF_SIZE];
+  wchar_t expected5[INBUF_SIZE];
+  memset(input5, 'A', INBUF_SIZE - 1);
+  input5[INBUF_SIZE - 1] = '\0';
+  for (int i = 0; i < INBUF_SIZE - 1; i++) {
+    expected5[i] = L'A';
+  }
+  result = check_conversion(input5, INBUF_SIZE - 1, expected5,
+                            (INBUF_SIZE - 1) * sizeof(wchar_t));
+  if (result != 0)
+    return result;
+
+  // Test case 6: Invalid UTF-8 sequence (overlong encoding of U+0000).
+  // The explicit casts avoid an out-of-range int -> char initializer where
+  // char is signed.
+  const char input6[] = {(char)0xC0, (char)0x80};
+  errno = 0;
+  result = check_conversion(input6, sizeof(input6), NULL, 0);
+  // 26 == 20 + 6: iconv() itself must fail on this sixth call ...
+  if (result != 26)
+    return 200;
+  // ... and it must report an illegal byte sequence. errno is read right
+  // after the call; only iconv_close() runs between the failure and here.
+  if (errno != EILSEQ)
+    return 201;
+
+  // Test case 7: Mixing ASCII and non-ASCII
+  const char input7[] = "Hello, 世界!";
+  const wchar_t expected7[] = L"Hello, 世界!";
+  result = check_conversion(input7, sizeof(input7) - 1, expected7,
+                            sizeof(expected7) - sizeof(wchar_t));
+  if (result != 0)
+    return result;
+
+  // Test case 8: Code point outside the Basic Multilingual Plane
+  const char input8[] = "𐐷";         // U+10437
+  const wchar_t expected8[] = L"𐐷";  // One 32-bit unit; no surrogates in UTF-32
+  result = check_conversion(input8, sizeof(input8) - 1, expected8,
+                            sizeof(expected8) - sizeof(wchar_t));
+  if (result != 0)
+    return result;
+
+  return 0;  // All tests passed
+}