diff --git a/libc/calls/getcpucount.c b/libc/calls/getcpucount.c
index 500ce3c76..f1433d7f8 100644
--- a/libc/calls/getcpucount.c
+++ b/libc/calls/getcpucount.c
@@ -95,6 +95,6 @@ __attribute__((__constructor__)) static void _getcpucount_init(void) {
  *
  * @return cpu count or 0 if it couldn't be determined
  */
-unsigned _getcpucount(void) {
+int _getcpucount(void) {
   return g_cpucount;
 }
diff --git a/libc/calls/struct/sched_param.h b/libc/calls/struct/sched_param.h
index d30339c33..e445ef056 100644
--- a/libc/calls/struct/sched_param.h
+++ b/libc/calls/struct/sched_param.h
@@ -2,6 +2,7 @@
 #define COSMOPOLITAN_LIBC_CALLS_STRUCT_SCHED_PARAM_H_
 #include "libc/calls/struct/timespec.h"
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
+COSMOPOLITAN_C_START_
 
 struct sched_param {
   int32_t sched_priority;
@@ -15,5 +16,6 @@ int sched_rr_get_interval(int, struct timespec *);
 int sched_setparam(int, const struct sched_param *);
 int sched_setscheduler(int, int, const struct sched_param *);
 
+COSMOPOLITAN_C_END_
 #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
 #endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_SCHED_PARAM_H_ */
diff --git a/libc/intrin/fmax.c b/libc/intrin/fmax.c
index d4ebd3bd1..0719e6d7d 100644
--- a/libc/intrin/fmax.c
+++ b/libc/intrin/fmax.c
@@ -26,10 +26,10 @@
  * signed zeroes.
  */
 double fmax(double x, double y) {
-  if (__builtin_isnan(x)) return y;
-  if (__builtin_isnan(y)) return x;
-  if (__builtin_signbit(x) != __builtin_signbit(y)) {
-    return __builtin_signbit(x) ? y : x; /* C99 Annex F.9.9.2 */
+  if (isnan(x)) return y;
+  if (isnan(y)) return x;
+  if (signbit(x) != signbit(y)) {
+    return signbit(x) ? y : x; /* C99 Annex F.9.9.2 */
   }
   return x < y ? y : x;
 }
diff --git a/libc/intrin/fmaxf.c b/libc/intrin/fmaxf.c
index b2d7613fd..522352bde 100644
--- a/libc/intrin/fmaxf.c
+++ b/libc/intrin/fmaxf.c
@@ -26,10 +26,10 @@
  * signed zeroes.
  */
 float fmaxf(float x, float y) {
-  if (__builtin_isnan(x)) return y;
-  if (__builtin_isnan(y)) return x;
-  if (__builtin_signbitf(x) != __builtin_signbitf(y)) {
-    return __builtin_signbitf(x) ? y : x; /* C99 Annex F.9.9.2 */
+  if (isnan(x)) return y;
+  if (isnan(y)) return x;
+  if (signbit(x) != signbit(y)) {
+    return signbit(x) ? y : x; /* C99 Annex F.9.9.2 */
   }
   return x < y ? y : x;
 }
diff --git a/libc/intrin/fmaxl.c b/libc/intrin/fmaxl.c
index ef4ee057a..392c0700c 100644
--- a/libc/intrin/fmaxl.c
+++ b/libc/intrin/fmaxl.c
@@ -27,10 +27,10 @@
  * signed zeroes.
  */
 long double fmaxl(long double x, long double y) {
-  if (__builtin_isnan(x)) return y;
-  if (__builtin_isnan(y)) return x;
-  if (__builtin_signbitl(x) != __builtin_signbitl(y)) {
-    return __builtin_signbitl(x) ? y : x; /* C99 Annex F.9.9.2 */
+  if (isnan(x)) return y;
+  if (isnan(y)) return x;
+  if (signbit(x) != signbit(y)) {
+    return signbit(x) ? y : x; /* C99 Annex F.9.9.2 */
   }
   return x < y ? y : x;
 }
diff --git a/libc/math.h b/libc/math.h
index 2441f4c40..45d7ba16d 100644
--- a/libc/math.h
+++ b/libc/math.h
@@ -88,7 +88,6 @@ typedef double double_t;
 #define isnan(x)             __builtin_isnan(x)
 #define isfinite(x)          __builtin_isfinite(x)
 #define isnormal(x)          __builtin_isnormal(x)
-#define signbit(x)           __builtin_signbit(x)
 #define isgreater(x, y)      __builtin_isgreater(x, y)
 #define isgreaterequal(x, y) __builtin_isgreaterequal(x, y)
 #define isless(x, y)         __builtin_isless(x, y)
@@ -99,6 +98,11 @@ typedef double double_t;
 #define fpclassify(x) \
   __builtin_fpclassify(FP_NAN, FP_INFINITE, FP_NORMAL, FP_SUBNORMAL, FP_ZERO, x)
 
+#define signbit(x)                                      \
+  (sizeof(x) == sizeof(double)  ? __builtin_signbit(x)  \
+   : sizeof(x) == sizeof(float) ? __builtin_signbitf(x) \
+                                : __builtin_signbitl(x))
+
 extern int signgam;
 
 double acos(double);
@@ -305,7 +309,7 @@ void sincos(double, double *, double *);
 void sincosf(float, float *, float *);
 void sincosl(long double, long double *, long double *);
 
-float fsumf(const float *, size_t);
+double fsumf(const float *, size_t);
 double fsum(const double *, size_t);
 
 double j0(double);
diff --git a/libc/runtime/runtime.h b/libc/runtime/runtime.h
index fecb3f2a6..8e76eeb0d 100644
--- a/libc/runtime/runtime.h
+++ b/libc/runtime/runtime.h
@@ -99,7 +99,7 @@ void _intsort(int *, size_t);
 void _longsort(long *, size_t);
 bool _isheap(void *);
 int NtGetVersion(void) pureconst;
-unsigned _getcpucount(void) pureconst;
+int _getcpucount(void) pureconst;
 long _missingno();
 void __oom_hook(size_t);
 void _loadxmm(void *);
diff --git a/libc/tinymath/acos.c b/libc/tinymath/acos.c
index 40a1cb667..939a87abf 100644
--- a/libc/tinymath/acos.c
+++ b/libc/tinymath/acos.c
@@ -141,3 +141,7 @@ double acos(double x)
 	w = R(z)*s+c;
 	return 2*(df+w);
 }
+
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
+__strong_reference(acos, acosl);
+#endif
diff --git a/libc/tinymath/acosf.c b/libc/tinymath/acosf.c
index 28e1d3f47..15c5c902d 100644
--- a/libc/tinymath/acosf.c
+++ b/libc/tinymath/acosf.c
@@ -32,7 +32,7 @@ asm(".ident\t\"\\n\\n\
 Musl libc (MIT License)\\n\
 Copyright 2005-2014 Rich Felker, et. al.\"");
 asm(".include \"libc/disclaimer.inc\"");
-/* clang-format off */
+// clang-format off
 
 /* origin: FreeBSD /usr/src/lib/msun/src/e_acosf.c */
 /*
diff --git a/libc/tinymath/acosh.c b/libc/tinymath/acosh.c
index 6259fed68..35286d201 100644
--- a/libc/tinymath/acosh.c
+++ b/libc/tinymath/acosh.c
@@ -31,7 +31,7 @@ asm(".ident\t\"\\n\\n\
 Musl libc (MIT License)\\n\
 Copyright 2005-2014 Rich Felker, et. al.\"");
 asm(".include \"libc/disclaimer.inc\"");
-/* clang-format off */
+// clang-format off
 
 /**
  * Returns inverse hyperbolic cosine of 𝑥.
@@ -53,3 +53,7 @@ double acosh(double x)
 	/* |x| >= 0x1p26 or nan */
 	return log(x) + 0.693147180559945309417232121458176568;
 }
+
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
+__strong_reference(acosh, acoshl);
+#endif
diff --git a/libc/tinymath/acoshl.c b/libc/tinymath/acoshl.c
index 6116e043d..132aa4a60 100644
--- a/libc/tinymath/acoshl.c
+++ b/libc/tinymath/acoshl.c
@@ -38,6 +38,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/math.h"
 #include "libc/tinymath/freebsd.internal.h"
+#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024)
 
 asm(".ident\t\"\\n\\n\
 FreeBSD libm (BSD-2 License)\\n\
@@ -62,8 +63,6 @@ asm(".include \"libc/disclaimer.inc\"");
 #error "Unsupported long double format"
 #endif
 
-#define	BIAS	(LDBL_MAX_EXP - 1)
-
 static const double
 one	= 1.0;
 
@@ -108,3 +107,5 @@ acoshl(long double x)
 	    RETURNI(log1pl(t+sqrtl(2.0*t+t*t)));
 	}
 }
+
+#endif /* long double is long */
diff --git a/libc/tinymath/acosl.c b/libc/tinymath/acosl.c
index bd9d47693..7958cd307 100644
--- a/libc/tinymath/acosl.c
+++ b/libc/tinymath/acosl.c
@@ -28,6 +28,7 @@
 #include "libc/math.h"
 #include "libc/tinymath/invtrigl.internal.h"
 #include "libc/tinymath/ldshape.internal.h"
+#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024)
 
 asm(".ident\t\"\\n\\n\
 fdlibm (fdlibm license)\\n\
@@ -54,22 +55,20 @@ asm(".include \"libc/disclaimer.inc\"");
  * Converted to long double by David Schultz <das@FreeBSD.ORG>.
  */
 
-/**
- * Returns arc cosine of 𝑥.
- *
- * @define atan2(fabs(sqrt((1-𝑥)*(1+𝑥))),𝑥)
- * @domain -1 ≤ 𝑥 ≤ 1
- */
-long double acosl(long double x) {
-#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
-	return acos(x);
-#elif (LDBL_MANT_DIG == 64 || LDBL_MANT_DIG == 113) && LDBL_MAX_EXP == 16384
 #if LDBL_MANT_DIG == 64
 #define CLEARBOTTOM(u) (u.i.m &= -1ULL << 32)
 #elif LDBL_MANT_DIG == 113
 #define CLEARBOTTOM(u) (u.i.lo = 0)
 #endif
 
+/**
+ * Returns arc cosine of 𝑥.
+ *
+ * @define atan2(fabs(sqrt((1-𝑥)*(1+𝑥))),𝑥)
+ * @domain -1 ≤ 𝑥 ≤ 1
+ */
+long double acosl(long double x)
+{
 	union ldshape u = {x};
 	long double z, s, c, f;
 	uint16_t e = u.i.se & 0x7fff;
@@ -102,8 +101,6 @@ long double acosl(long double x) {
 	f = u.f;
 	c = (z - f*f)/(s + f);
 	return 2*(__invtrigl_R(z)*s + c + f);
-
-#else
-#error "architecture unsupported"
-#endif
 }
+
+#endif /* long double is long */
diff --git a/libc/tinymath/asinf.c b/libc/tinymath/asinf.c
index 0e288a0d0..2af22e009 100644
--- a/libc/tinymath/asinf.c
+++ b/libc/tinymath/asinf.c
@@ -35,7 +35,7 @@ asm(".ident\t\"\\n\\n\
 Musl libc (MIT License)\\n\
 Copyright 2005-2014 Rich Felker, et. al.\"");
 asm(".include \"libc/disclaimer.inc\"");
-/* clang-format off */
+// clang-format off
 
 /* origin: FreeBSD /usr/src/lib/msun/src/e_asinf.c */
 /*
diff --git a/libc/tinymath/asinh.c b/libc/tinymath/asinh.c
index fab4113f9..dcbbf87f5 100644
--- a/libc/tinymath/asinh.c
+++ b/libc/tinymath/asinh.c
@@ -64,3 +64,7 @@ double asinh(double x)
 	}
 	return s ? -x : x;
 }
+
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
+__strong_reference(asinh, asinhl);
+#endif
diff --git a/libc/tinymath/asinhl.c b/libc/tinymath/asinhl.c
index 26640a763..ec6992e14 100644
--- a/libc/tinymath/asinhl.c
+++ b/libc/tinymath/asinhl.c
@@ -38,6 +38,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/math.h"
 #include "libc/tinymath/freebsd.internal.h"
+#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024)
 
 asm(".ident\t\"\\n\\n\
 FreeBSD libm (BSD-2 License)\\n\
@@ -65,8 +66,6 @@ asm(".include \"libc/disclaimer.inc\"");
 #error "Unsupported long double format"
 #endif
 
-#define	BIAS	(LDBL_MAX_EXP - 1)
-
 static const double
 one =  1.00000000000000000000e+00, /* 0x3FF00000, 0x00000000 */
 huge=  1.00000000000000000000e+300;
@@ -110,3 +109,5 @@ asinhl(long double x)
 	}
 	RETURNI((hx & 0x8000) == 0 ? w : -w);
 }
+
+#endif /* long double is long */
diff --git a/libc/tinymath/cacos.c b/libc/tinymath/cacos.c
index e37badfe2..7b60e4257 100644
--- a/libc/tinymath/cacos.c
+++ b/libc/tinymath/cacos.c
@@ -33,9 +33,7 @@ asm(".ident\t\"\\n\\n\
 Musl libc (MIT License)\\n\
 Copyright 2005-2014 Rich Felker, et. al.\"");
 asm(".include \"libc/disclaimer.inc\"");
-/* clang-format off */
-
-
+// clang-format off
 
 // FIXME: Hull et al. "Implementing the complex arcsine and arccosine functions using exception handling" 1997
 
diff --git a/libc/tinymath/catan.c b/libc/tinymath/catan.c
index 8a0263db4..ec2a6d489 100644
--- a/libc/tinymath/catan.c
+++ b/libc/tinymath/catan.c
@@ -145,3 +145,7 @@ double complex catan(double complex z)
 	w = CMPLX(w, 0.25 * log(a));
 	return w;
 }
+
+#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
+__strong_reference(catan, catanl);
+#endif
diff --git a/libc/tinymath/catanl.c b/libc/tinymath/catanl.c
index 098de31e6..491ea59d1 100644
--- a/libc/tinymath/catanl.c
+++ b/libc/tinymath/catanl.c
@@ -2,32 +2,26 @@
 │vi: set et ft=c ts=8 tw=8 fenc=utf-8                                       :vi│
 ╚──────────────────────────────────────────────────────────────────────────────╝
 │                                                                              │
-│  Musl Libc                                                                   │
-│  Copyright © 2005-2014 Rich Felker, et al.                                   │
+│ OpenBSD /usr/src/lib/libm/src/s_catanl.c                                     │
 │                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
+│ Copyright (c) 2008 Stephen L. Moshier <steve@moshier.net>                    │
 │                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
+│ Permission to use, copy, modify, and distribute this software for any        │
+│ purpose with or without fee is hereby granted, provided that the above       │
+│ copyright notice and this permission notice appear in all copies.            │
 │                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES     │
+│ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF             │
+│ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR      │
+│ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES       │
+│ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN        │
+│ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/complex.h"
 #include "libc/math.h"
 #include "libc/tinymath/complex.internal.h"
+#if !(LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024)
 
 asm(".ident\t\"\\n\\n\
 OpenBSD libm (ISC License)\\n\
@@ -38,22 +32,6 @@ Copyright 2005-2014 Rich Felker, et. al.\"");
 asm(".include \"libc/disclaimer.inc\"");
 // clang-format off
 
-/* origin: OpenBSD /usr/src/lib/libm/src/s_catanl.c */
-/*
- * Copyright (c) 2008 Stephen L. Moshier <steve@moshier.net>
- *
- * Permission to use, copy, modify, and distribute this software for any
- * purpose with or without fee is hereby granted, provided that the above
- * copyright notice and this permission notice appear in all copies.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
- */
 /*
  *      Complex circular arc tangent
  *
@@ -97,13 +75,6 @@ asm(".include \"libc/disclaimer.inc\"");
  * 2.9e-17.  See also clog().
  */
 
-
-#if LDBL_MANT_DIG == 53 && LDBL_MAX_EXP == 1024
-long double complex catanl(long double complex z)
-{
-	return catan(z);
-}
-#else
 static const long double PIL = 3.141592653589793238462643383279502884197169L;
 static const long double DP1 = 3.14159265358979323829596852490908531763125L;
 static const long double DP2 = 1.6667485837041756656403424829301998703007e-19L;
@@ -149,4 +120,4 @@ long double complex catanl(long double complex z)
 	return w;
 }
 
-#endif
+#endif /* long double is long */
diff --git a/libc/tinymath/fmin.c b/libc/tinymath/fmin.c
index a382ebf85..08af674ea 100644
--- a/libc/tinymath/fmin.c
+++ b/libc/tinymath/fmin.c
@@ -26,10 +26,10 @@
  * signed zeroes.
  */
 double fmin(double x, double y) {
-  if (__builtin_isnan(x)) return y;
-  if (__builtin_isnan(y)) return x;
-  if (__builtin_signbit(x) != __builtin_signbit(y)) {
-    return __builtin_signbit(x) ? x : y; /* C99 Annex F.9.9.2 */
+  if (isnan(x)) return y;
+  if (isnan(y)) return x;
+  if (signbit(x) != signbit(y)) {
+    return signbit(x) ? x : y; /* C99 Annex F.9.9.2 */
   }
   return x < y ? x : y;
 }
diff --git a/libc/tinymath/fminf.c b/libc/tinymath/fminf.c
index a00d0a7b0..4f2e3aa00 100644
--- a/libc/tinymath/fminf.c
+++ b/libc/tinymath/fminf.c
@@ -17,6 +17,7 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/math.h"
+#include "libc/tinymath/freebsd.internal.h"
 
 /**
  * Returns minimum of two floats.
@@ -26,10 +27,10 @@
  * signed zeroes.
  */
 float fminf(float x, float y) {
-  if (__builtin_isnan(x)) return y;
-  if (__builtin_isnan(y)) return x;
-  if (__builtin_signbitf(x) != __builtin_signbitf(y)) {
-    return __builtin_signbitf(x) ? x : y; /* C99 Annex F.9.9.2 */
+  if (isnan(x)) return y;
+  if (isnan(y)) return x;
+  if (signbit(x) != signbit(y)) {
+    return signbit(x) ? x : y; /* C99 Annex F.9.9.2 */
   }
   return x < y ? x : y;
 }
diff --git a/libc/tinymath/fminl.c b/libc/tinymath/fminl.c
index 98230af5a..ccbf7c049 100644
--- a/libc/tinymath/fminl.c
+++ b/libc/tinymath/fminl.c
@@ -27,10 +27,10 @@
  * signed zeroes.
  */
 long double fminl(long double x, long double y) {
-  if (__builtin_isnan(x)) return y;
-  if (__builtin_isnan(y)) return x;
-  if (__builtin_signbitl(x) != __builtin_signbitl(y)) {
-    return __builtin_signbitl(x) ? x : y; /* C99 Annex F.9.9.2 */
+  if (isnan(x)) return y;
+  if (isnan(y)) return x;
+  if (signbit(x) != signbit(y)) {
+    return signbit(x) ? x : y; /* C99 Annex F.9.9.2 */
   }
   return x < y ? x : y;
 }
diff --git a/libc/tinymath/fsumf.c b/libc/tinymath/fsumf.c
index c0c42ba7e..4be39a87b 100644
--- a/libc/tinymath/fsumf.c
+++ b/libc/tinymath/fsumf.c
@@ -22,8 +22,8 @@
 /**
  * Adds floats in array.
  */
-float fsumf(const float *p, size_t n) {
-  float s;
+double fsumf(const float *p, size_t n) {
+  double s;
   size_t i;
   if (n > 8) return fsumf(p, n / 2) + fsumf(p + n / 2, n - n / 2);
   for (s = i = 0; i < n; ++i) s += p[i];
diff --git a/test/libc/str/longsort_test.c b/test/libc/str/longsort_test.c
index d6ab1d807..78d7a6295 100644
--- a/test/libc/str/longsort_test.c
+++ b/test/libc/str/longsort_test.c
@@ -27,6 +27,18 @@
 #include "libc/testlib/testlib.h"
 #include "third_party/vqsort/vqsort.h"
 
+void InsertionSort(int *A, int n) {
+  for (int i = 1; i < n; i++) {
+    int key = A[i];
+    int j = i - 1;
+    while (j >= 0 && A[j] > key) {
+      A[j + 1] = A[j];
+      j--;
+    }
+    A[j + 1] = key;
+  }
+}
+
 int CompareLong(const void *a, const void *b) {
   const long *x = a;
   const long *y = b;
@@ -145,14 +157,14 @@ int CompareInt(const void *a, const void *b) {
   return 0;
 }
 
-TEST(_intsort, test) {
+TEST(InsertionSort, test) {
   size_t n = 5000;
   int *a = gc(calloc(n, sizeof(int)));
   int *b = gc(calloc(n, sizeof(int)));
   rngset(a, n * sizeof(int), 0, 0);
   memcpy(b, a, n * sizeof(int));
   qsort(a, n, sizeof(int), CompareInt);
-  _intsort(b, n);
+  InsertionSort(b, n);
   ASSERT_EQ(0, memcmp(b, a, n * sizeof(int)));
 }
 
@@ -218,13 +230,14 @@ TEST(radix_sort_int32, test) {
   ASSERT_EQ(0, memcmp(b, a, n * sizeof(int)));
 }
 
-BENCH(_intsort, bench) {
+BENCH(InsertionSort, bench) {
   printf("\n");
   size_t n = 10000;
   int *p1 = gc(malloc(n * sizeof(int)));
   int *p2 = gc(malloc(n * sizeof(int)));
   rngset(p1, n * sizeof(int), 0, 0);
-  EZBENCH2("_intsort", memcpy(p2, p1, n * sizeof(int)), _intsort(p2, n));
+  EZBENCH2("InsertionSort", memcpy(p2, p1, n * sizeof(int)),
+           InsertionSort(p2, n));
 #ifdef __x86_64__
   if (X86_HAVE(AVX2)) {
     EZBENCH2("vqsort_int32_avx2", memcpy(p2, p1, n * sizeof(int)),
diff --git a/test/libc/tinymath/remainder_test.c b/test/libc/tinymath/remainder_test.c
index c8eb912f5..bf5138bbe 100644
--- a/test/libc/tinymath/remainder_test.c
+++ b/test/libc/tinymath/remainder_test.c
@@ -24,7 +24,6 @@
 #include "libc/runtime/runtime.h"
 #include "libc/testlib/ezbench.h"
 #include "libc/testlib/testlib.h"
-#include "libc/tinymath/tinymath.h"
 #include "libc/x/x.h"
 
 float remainderf2(float, float);
diff --git a/third_party/ggml/README.cosmo b/third_party/ggml/README.cosmo
index 8afbd578e..f71191337 100644
--- a/third_party/ggml/README.cosmo
+++ b/third_party/ggml/README.cosmo
@@ -27,3 +27,5 @@ LOCAL CHANGES
   - Refactor headers per cosmo convention
   - Replace code like 'ggjt' with READ32BE("ggjt")
   - Remove C++ exceptions; use Die() function instead
+  - Removed division from matrix multiplication.
+  - Let quantizer convert between ggmt formats
diff --git a/third_party/ggml/common.cc b/third_party/ggml/common.cc
index e9f95ed1f..d780d4f9f 100644
--- a/third_party/ggml/common.cc
+++ b/third_party/ggml/common.cc
@@ -34,6 +34,7 @@
 #include "libc/stdio/stdio.h"
 #include "libc/str/str.h"
 #include "libc/sysv/consts/fileno.h"
+#include "third_party/ggml/llama_util.h"
 #include "third_party/libcxx/algorithm"
 #include "third_party/libcxx/cassert"
 #include "third_party/libcxx/cstring"
@@ -50,13 +51,6 @@ Copyright (c) 2023 Georgi Gerganov\"");
 asm(".include \"libc/disclaimer.inc\"");
 // clang-format off
 
-static bool is_integer_str(const char *s) {
-    if (*s == '-') ++s;
-    if (!*s) return false;
-    while (isdigit(*s)) ++s;
-    return !*s;
-}
-
 static std::string replace_all(std::string const& original,
                                std::string const& before,
                                std::string const& after) {
@@ -92,7 +86,7 @@ static bool append_file_to_prompt(const char *path, gpt_params & params) {
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    params.n_threads = std::min(20, std::max(1, (int)(_getcpucount() * 0.75)));
+    params.n_threads = std::min(20, std::max(1, _getcpucount() >> 1));
 
     bool invalid_param = false;
     std::string arg;
diff --git a/third_party/ggml/common.h b/third_party/ggml/common.h
index cffd92363..8cd4562ba 100644
--- a/third_party/ggml/common.h
+++ b/third_party/ggml/common.h
@@ -1,4 +1,4 @@
-// -*- c++ -*-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
 #define COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
 #include "libc/calls/struct/termios.h"
@@ -21,7 +21,7 @@
 struct gpt_params {
     int32_t seed          = -1;   // RNG seed
     int32_t verbose       = 0;    // Logging verbosity
-    int32_t n_threads     = std::min(1, (int)(_getcpucount() * 0.75));
+    int32_t n_threads     = std::max(1, _getcpucount() >> 1);
     int32_t n_predict     = -1;   // new tokens to predict
     int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512;  // context size
diff --git a/third_party/ggml/fp16.c b/third_party/ggml/fp16.c
index 2491addd9..71d46d09c 100644
--- a/third_party/ggml/fp16.c
+++ b/third_party/ggml/fp16.c
@@ -78,7 +78,15 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
 }
 
 void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
-    for (size_t i = 0; i < n; i++) {
+    size_t i = 0;
+#ifdef __F16C__
+    for (; i + 7 < n; i += 8) {
+        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
+        __m256 y_vec = _mm256_cvtph_ps(x_vec);
+        _mm256_storeu_ps(y + i, y_vec);
+    }
+#endif
+    for (; i < n; i++) {
         y[i] = GGML_FP16_TO_FP32(x[i]);
     }
 }
diff --git a/third_party/ggml/fp16.h b/third_party/ggml/fp16.h
index 37e746bc2..c544a242b 100644
--- a/third_party/ggml/fp16.h
+++ b/third_party/ggml/fp16.h
@@ -3,9 +3,6 @@
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
 COSMOPOLITAN_C_START_
 
-#define GGML_GELU_FP16
-#define GGML_SILU_FP16
-
 #ifdef __ARM_NEON
 // we use the built-in 16-bit float type
 typedef __fp16 ggml_fp16_t;
diff --git a/third_party/ggml/fp16.internal.h b/third_party/ggml/fp16.internal.h
index 17fe2b99d..b59e28825 100644
--- a/third_party/ggml/fp16.internal.h
+++ b/third_party/ggml/fp16.internal.h
@@ -8,6 +8,9 @@
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
 COSMOPOLITAN_C_START_
 
+#define GGML_GELU_FP16
+#define GGML_SILU_FP16
+
 extern ggml_fp16_t table_gelu_f16[1 << 16];
 extern ggml_fp16_t table_silu_f16[1 << 16];
 extern ggml_fp16_t table_exp_f16[1 << 16];
diff --git a/third_party/ggml/ggjt.v1.q4_0.c b/third_party/ggml/ggjt.v1.q4_0.c
index c445f67a2..3c9476422 100644
--- a/third_party/ggml/ggjt.v1.q4_0.c
+++ b/third_party/ggml/ggjt.v1.q4_0.c
@@ -613,23 +613,37 @@ void ggml_vec_dot_v1_q4_0_q8_0(const int n, float * restrict s, const void * res
     __m256 acc = _mm256_setzero_ps();
 
     // Main loop
-    for (int i = 0; i < nb; ++i) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8( 8 );
-        bx = _mm256_sub_epi8( bx, off );
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps( d, q, acc );
+#define WORK(I) \
+    /* Compute combined scale for the block */ \
+    const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \
+    __m256i bx = bytes_from_nibbles_32(x[I].qs); \
+    /* Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */ \
+    const __m256i off = _mm256_set1_epi8( 8 ); \
+    bx = _mm256_sub_epi8( bx, off ); \
+    __m256i by = _mm256_loadu_si256((const __m256i *)y[I].qs); \
+    const __m256 q = mul_sum_i8_pairs_float(bx, by); \
+    /* Multiply q with scale and accumulate */ \
+    acc = _mm256_fmadd_ps( d, q, acc )
+    int i = 0;
+    for (; i + 12 < nb; i += 12) {
+        _mm_prefetch(x+i+12, 3);
+        _mm_prefetch(x+i+15, 3);
+        _mm_prefetch(x+i+18, 3);
+        _mm_prefetch(x+i+21, 3);
+        _mm_prefetch(y+i+12, 3);
+        _mm_prefetch(y+i+14, 3);
+        _mm_prefetch(y+i+16, 3);
+        _mm_prefetch(y+i+18, 3);
+        _mm_prefetch(y+i+20, 3);
+        _mm_prefetch(y+i+22, 3);
+        for (int j = 0; j < 12; ++j) {
+            WORK(i+j);
+        }
     }
+    for (; i < nb; ++i) {
+        WORK(i);
+    }
+#undef WORK
 
     *s = hsum_float_8(acc);
 #elif defined(__AVX__)
diff --git a/third_party/ggml/ggml.c b/third_party/ggml/ggml.c
index 6a3f07401..603bafa20 100644
--- a/third_party/ggml/ggml.c
+++ b/third_party/ggml/ggml.c
@@ -1784,9 +1784,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
-    //
     // Main loop
-    //
 #define WORK(I) \
     /* Compute combined scale for the block */ \
     const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \
@@ -2702,9 +2700,15 @@ inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
 
 inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
 #ifndef GGML_USE_ACCELERATE
-    ggml_float sum = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sum += (ggml_float)x[i];
+    int i = 0;
+    ggml_float sum = 0;
+#if __AVX__ || __AVX2__ || __AVX512F__
+    for (; i + 8 <= n; i += 8) {
+        sum += hsum_float_8(_mm256_loadu_ps(x + i));
+    }
+#endif
+    for (; i < n; ++i) {
+        sum += x[i];
     }
     *s = sum;
 #else
@@ -2802,6 +2806,7 @@ const char *const ggjt_v2_type_name[GGML_TYPE_COUNT] = {
     [GGML_TYPE_F16]  = "f16",
     [GGML_TYPE_Q4_0] = "q4_0",
     [GGML_TYPE_Q4_1] = "q4_1",
+    [GGML_TYPE_Q4_2] = "q4_2",
     [GGML_TYPE_Q5_0] = "q5_0",
     [GGML_TYPE_Q5_1] = "q5_1",
     [GGML_TYPE_Q8_0] = "q8_0",
@@ -8113,7 +8118,7 @@ static void ggml_compute_forward_alibi_f32(
     assert(ne1 + n_past == ne0); (void) n_past;
 
     // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << _bsr(n_head);
+    const int n_heads_log2_floor = 1 << _bsr(n_head); // [jart]
 
     const float m0 = exp2f(-8.0f / n_heads_log2_floor);
     const float m1 = exp2f(-4.0f / n_heads_log2_floor);
diff --git a/third_party/ggml/ggml.h b/third_party/ggml/ggml.h
index c8f4dc138..5cf9e4de6 100644
--- a/third_party/ggml/ggml.h
+++ b/third_party/ggml/ggml.h
@@ -1,3 +1,4 @@
+// -*- c; c-basic-offset:4 -*-
 #ifndef COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
 #define COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
 #if !(__ASSEMBLER__ + __LINKER__ + 0)
diff --git a/third_party/ggml/ggml.mk b/third_party/ggml/ggml.mk
index e1c768350..3aee03ac9 100644
--- a/third_party/ggml/ggml.mk
+++ b/third_party/ggml/ggml.mk
@@ -128,6 +128,7 @@ THIRD_PARTY_GGML_LLAMA_DIRECTDEPS =					\
 	LIBC_STR							\
 	LIBC_STUBS							\
 	LIBC_SYSV							\
+	LIBC_SYSV_CALLS							\
 	LIBC_THREAD							\
 	LIBC_TINYMATH							\
 	LIBC_ZIPOS							\
@@ -180,6 +181,7 @@ o/$(MODE)/third_party/ggml/companionai.txt.zip.o: private		\
 
 THIRD_PARTY_GGML_COMS =							\
 	$(THIRD_PARTY_GGML_LLAMA)					\
+	o/$(MODE)/third_party/ggml/quantize.com				\
 	o/$(MODE)/third_party/ggml/perplexity.com
 
 THIRD_PARTY_GGML_BINS = $(THIRD_PARTY_GGML_COMS) $(THIRD_PARTY_GGML_COMS:%=%.dbg)
diff --git a/third_party/ggml/llama.cc b/third_party/ggml/llama.cc
index f104bf962..610f54b22 100644
--- a/third_party/ggml/llama.cc
+++ b/third_party/ggml/llama.cc
@@ -31,6 +31,7 @@
 #include "libc/intrin/bits.h"
 #include "libc/macros.internal.h"
 #include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/posix.h"
 #include "third_party/ggml/fp16.h"
 #include "third_party/ggml/ggml.h"
 #include "third_party/ggml/llama_util.h"
@@ -443,8 +444,9 @@ struct llama_file_loader {
     llama_hparams hparams;
     llama_vocab vocab;
 
-    llama_file_loader(const char * fname, size_t file_idx, llama_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
+    llama_file_loader(const char * fname, size_t file_idx,
+                      llama_load_tensors_map & tensors_map)
+            : file(fname, "rb") {
         // fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
         read_magic();
         read_hparams();
@@ -568,8 +570,9 @@ struct llama_file_saver {
         write_vocab();
     }
     void write_magic() {
+        ggjt_v2();
         file.write_u32(READ32BE("ggjt")); // magic
-        file.write_u32(1); // version
+        file.write_u32(2); // version
     }
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
@@ -2003,16 +2006,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
                ggml_type_name(tensor.type));
 
-        // This used to be a regex, but <regex> has an extreme cost to compile times.
-        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
-
-        // quantize only 2D tensors
-        quantize &= (tensor.ne.size() == 2);
-
-        // uncomment this to keep the output layer in FP16
-        //if (tensor.name == "output.weight") {
-        //    quantize = false;
-        //}
+        // only quantize 2d weights that aren't the output layer
+        bool quantize =
+                tensor.ne.size() == 2 &&
+                tensor.type != quantized_type &&
+                _endswith(tensor.name.c_str(), "weight") &&
+                tensor.name != "output.weight";
 
         enum ggml_type new_type;
         void * new_data;
@@ -2024,6 +2023,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             new_data = tensor.data;
             new_size = tensor.size;
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+        } else if (quantized_type == GGML_TYPE_F16) {
+            GGML_ASSERT(tensor.type == GGML_TYPE_F32);
+            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+            new_type = quantized_type;
+            new_size = nelements * 2;
+            work.resize(new_size);
+            new_data = work.addr;
+            ggml_fp32_to_fp16_row((const float *)tensor.data, (ggml_fp16_t *)new_data, nelements);
         } else {
             new_type = quantized_type;
             float * f32_data;
diff --git a/third_party/ggml/llama.h b/third_party/ggml/llama.h
index 041794768..9c28a60dc 100644
--- a/third_party/ggml/llama.h
+++ b/third_party/ggml/llama.h
@@ -1,4 +1,4 @@
-// -*- c++ -*-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef LLAMA_H
 #define LLAMA_H
 #include "libc/intrin/bits.h"
diff --git a/third_party/ggml/llama_util.h b/third_party/ggml/llama_util.h
index 05184945d..ae43af616 100755
--- a/third_party/ggml/llama_util.h
+++ b/third_party/ggml/llama_util.h
@@ -1,12 +1,11 @@
-// Internal header to be included only by llama.cpp.
-// Contains wrappers around OS interfaces.
-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef LLAMA_UTIL_H
 #define LLAMA_UTIL_H
 #include "libc/calls/struct/rlimit.h"
 #include "libc/dce.h"
 #include "libc/fmt/fmt.h"
 #include "libc/runtime/sysconf.h"
+#include "libc/str/str.h"
 #include "libc/sysv/consts/madv.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/prot.h"
@@ -22,6 +21,9 @@
 #include "third_party/libcxx/vector"
 // clang-format off
 
+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
+
 #define LLAMA_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -47,6 +49,13 @@ static void Die(const char *fmt, ...) {
     exit(1);
 }
 
+static inline bool is_integer_str(const char *s) {
+    if (*s == '-') ++s;
+    if (!*s) return false;
+    while (isdigit(*s)) ++s;
+    return !*s;
+}
+
 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap
     FILE * fp;
diff --git a/third_party/ggml/main.cc b/third_party/ggml/main.cc
index cb8f1b845..8a399f825 100644
--- a/third_party/ggml/main.cc
+++ b/third_party/ggml/main.cc
@@ -28,6 +28,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/assert.h"
 #include "libc/calls/calls.h"
+#include "libc/calls/struct/sched_param.h"
 #include "libc/calls/struct/sigaction.h"
 #include "libc/calls/struct/stat.h"
 #include "libc/fmt/fmt.h"
@@ -37,9 +38,11 @@
 #include "libc/nexgen32e/x86feature.h"
 #include "libc/runtime/runtime.h"
 #include "libc/stdio/stdio.h"
+#include "libc/sysv/consts/ioprio.h"
 #include "libc/sysv/consts/map.h"
 #include "libc/sysv/consts/msync.h"
 #include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/prio.h"
 #include "libc/sysv/consts/prot.h"
 #include "libc/sysv/consts/sig.h"
 #include "third_party/ggml/common.h"
@@ -66,7 +69,6 @@ static int n_past;
 static int n_remain;
 static int n_consumed;
 static bool input_noecho;
-static bool is_antiprompt;
 
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -103,7 +105,7 @@ static int CompareTime(struct timespec a, struct timespec b) {
 ////////////////////////////////////////////////////////////////////////////////
 // ux explanatory logging for llama.com developers
 
-#if 1
+#if 0
 #define DEVLOG(...) (void)0
 #else
 #define DEVLOG(...) if (g_devlog) fprintf(g_devlog, __VA_ARGS__)
@@ -187,16 +189,16 @@ static bool has_antiprompt(std::string::size_type *out_index = nullptr,
 static void finish_initializing_prompt() {
     prompt_status = kPromptFinished;
     if (params.interactive) {
-        std::string::size_type pos;
+        std::string::size_type ap_index;
         is_interacting = true;
-        if (has_antiprompt(&pos)) {
+        if (has_antiprompt(&ap_index)) {
             console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-            printf("%s", last_output.substr(pos).c_str());
-            last_output.clear();
+            printf("%s", last_output.substr(ap_index).c_str());
             fflush(stdout);
         }
         console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
     }
+    last_output.clear();
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -208,8 +210,16 @@ static int on_missing_feature(const char *name) {
     return 1;
 }
 
+void MakeProcessNice(void) {
+    setpriority(PRIO_PROCESS, 0, 10);
+    ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
+    struct sched_param param = {sched_get_priority_min(SCHED_IDLE)};
+    sched_setscheduler(0, SCHED_IDLE, &param);
+}
+
 int main(int argc, char ** argv) {
 
+    MakeProcessNice();
     ShowCrashReports();
     setvbuf(stdin, NULL, _IONBF, 0);
     setvbuf(stdout, NULL, _IONBF, 0);
@@ -439,8 +449,7 @@ int main(int argc, char ** argv) {
 
     remember_init();
 
-    is_antiprompt = false;
-    input_noecho  = params.verbose <= 0;
+    input_noecho = params.verbose <= 0;
 
     n_past     = 0;
     n_remain   = params.n_predict;
@@ -561,6 +570,9 @@ int main(int argc, char ** argv) {
         fprintf(stderr, EPHEMERAL("loading weights..."));
     }
 
+    // tracks if last character written to stdout was newline
+    bool got_newline = false;
+
     while ((n_remain != 0 || params.interactive) && !is_terminated) {
 
         // perform evaluation
@@ -678,7 +690,8 @@ int main(int argc, char ** argv) {
             finish_initializing_prompt();
         }
 
-        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
+        if (prompt_status == kPromptFinished &&
+            (int) embd_inp.size() <= n_consumed && !is_interacting) {
             // out of user input, sample next token
             DEVLOG("out of user input, sample next token w/ embd_inp.size()=%d n_consumed=%d\n",
                    (int)embd_inp.size(), n_consumed);
@@ -808,21 +821,25 @@ int main(int argc, char ** argv) {
         //       --prompt 'Question: How old are you?\nAnswer: '
         //       --reverse-prompt $'\n'
         //
+        bool is_antiprompt;
         std::string ap_text;
-        std::string::size_type ap_index;
         std::string::size_type ap_extra;
-        is_antiprompt = has_antiprompt(&ap_index, &ap_text);
+        std::string::size_type ap_index;
+        if (prompt_status == kPromptFinished) {
+            is_antiprompt = has_antiprompt(&ap_index, &ap_text);
+        } else {
+            is_antiprompt = false;
+        }
 
         // display text
-        bool got_newline = false;
-        if (!input_noecho) {
+        if (!input_noecho && embd.size()) {
             std::string printme;
             for (auto id : embd) {
                 printme.append(llama_token_to_str(ctx, id));
             }
             if (is_antiprompt) {
-                ap_extra = last_output.size() - (ap_index + ap_text.size());
-                printme.erase(printme.size() - MIN(printme.size(), ap_extra));
+                ap_extra = last_output.size() - ap_index;
+                printme.erase(std::max(0, (int)(printme.size() - ap_extra)));
             }
             if (printme.size()) {
                 got_newline = printme[printme.size() - 1] == '\n';
@@ -832,6 +849,7 @@ int main(int argc, char ** argv) {
         }
         if (is_antiprompt) {
             if (!params.interactive) {
+                DEVLOG("exiting due to antiprompt\n");
                 if (!got_newline) {
                     printf("\n");
                 }
diff --git a/third_party/ggml/quantize.cc b/third_party/ggml/quantize.cc
index fe5a0c8f5..1472051bb 100644
--- a/third_party/ggml/quantize.cc
+++ b/third_party/ggml/quantize.cc
@@ -25,9 +25,12 @@
 │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
 │                                                                              │
 ╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/log/log.h"
+#include "libc/runtime/runtime.h"
 #include "third_party/ggml/common.h"
 #include "third_party/ggml/ggml.h"
 #include "third_party/ggml/llama.h"
+#include "third_party/ggml/llama_util.h"
 #include "third_party/libcxx/map"
 #include "third_party/libcxx/vector"
 
@@ -38,46 +41,26 @@ asm(".include \"libc/disclaimer.inc\"");
 // clang-format off
 
 static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
-  {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
-  {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
-  {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
-  {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
-  {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
-  {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+    {"f16",  LLAMA_FTYPE_MOSTLY_F16 },
+    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+    {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
+    {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
 };
 
-bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
-    auto it = LLAMA_FTYPE_MAP.find(ftype_str);
-    if (it != LLAMA_FTYPE_MAP.end()) {
-        ftype = it->second;
-        ftype_str_out = it->first;
-        return true;
-    }
-    // try to parse as an integer
-    // try {
-        int ftype_int = std::stoi(ftype_str);
-        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
-            if (it->second == ftype_int) {
-                ftype = it->second;
-                ftype_str_out = it->first;
-                return true;
-            }
-        }
-    // }
-    // catch (...) {
-    //     // stoi failed
-    // }
-    return false;
-}
-
 // usage:
-//  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type [nthreads]
 //
 int main(int argc, char ** argv) {
+    ShowCrashReports();
+
+    ggjt_v2();
     ggml_time_init();
 
     if (argc < 3) {
-        fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthreads]\n", argv[0]);
         for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
             fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
         }
@@ -91,60 +74,27 @@ int main(int argc, char ** argv) {
         ggml_free(ctx);
     }
 
-    // parse command line arguments
     const std::string fname_inp = argv[1];
-    std::string fname_out;
-    int nthread;
-    llama_ftype ftype;
+    const std::string fname_out = argv[2];
 
-    int arg_idx = 2;
-    std::string ftype_str;
-    if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
-        // argv[2] is the ftype
-        std::string fpath;
-        const size_t pos = fname_inp.find_last_of('/');
-        if (pos != std::string::npos) {
-            fpath = fname_inp.substr(0, pos + 1);
-        }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
-        arg_idx++;
-    }
-    else {
-        // argv[2] is the output path
-        fname_out = argv[arg_idx];
-        arg_idx++;
-
-        if (argc <= arg_idx) {
-            fprintf(stderr, "%s: missing ftype\n", __func__);
-            return 1;
-        }
-        // argv[3] is the ftype
-        if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
-            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
-            return 1;
-        }
-        arg_idx++;
+    if (fname_inp == fname_out) {
+        fprintf(stderr, "%s: input and output names are same\n", fname_inp.c_str());
+        exit(1);
     }
 
-    // parse nthreads
-    if (argc > arg_idx) {
-        // try {
-            nthread = std::stoi(argv[arg_idx]);
-        // }
-        // catch (const std::exception & e) {
-        //     Die("%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
-        //     return 1;
-        // }
+    enum llama_ftype ftype;
+    if (!is_integer_str(argv[3])) {
+        auto it = LLAMA_FTYPE_MAP.find(argv[3]);
+        if (it == LLAMA_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        ftype = it->second;
     } else {
-        nthread = 0;
+        ftype = (enum llama_ftype)atoi(argv[3]);
     }
 
-    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
-    if (nthread > 0) {
-        fprintf(stderr, " using %d threads", nthread);
-    }
-    fprintf(stderr, "\n");
+    int nthread = argc > 4 ? atoi(argv[4]) : 0;
 
     const int64_t t_main_start_us = ggml_time_us();
 
diff --git a/third_party/intel/f16cintrin.internal.h b/third_party/intel/f16cintrin.internal.h
index 67337d68c..71f09ec9e 100644
--- a/third_party/intel/f16cintrin.internal.h
+++ b/third_party/intel/f16cintrin.internal.h
@@ -18,10 +18,18 @@ __funline float _cvtsh_ss(unsigned short __S) {
   return __builtin_ia32_vec_ext_v4sf(__A, 0);
 }
 
+/**
+ * Converts four half-precision (16-bit) floating point values to
+ * single-precision floating point values.
+ */
 __funline __m128 _mm_cvtph_ps(__m128i __A) {
   return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__A);
 }
 
+/**
+ * Converts eight half-precision (16-bit) floating point values to
+ * single-precision floating point values.
+ */
 __funline __m256 _mm256_cvtph_ps(__m128i __A) {
   return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__A);
 }
@@ -37,6 +45,10 @@ __funline __m128i _mm_cvtps_ph(__m128 __A, const int __I) {
   return (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)__A, __I);
 }
 
+/**
+ * Converts eight single-precision floating point values to
+ * half-precision (16-bit) floating point values.
+ */
 __funline __m128i _mm256_cvtps_ph(__m256 __A, const int __I) {
   return (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__A, __I);
 }
diff --git a/third_party/radpajama/common-gptneox.h b/third_party/radpajama/common-gptneox.h
index 004a95d5e..05e285617 100644
--- a/third_party/radpajama/common-gptneox.h
+++ b/third_party/radpajama/common-gptneox.h
@@ -1,4 +1,4 @@
-// -*- c++ -*-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
 #define COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
 #include "libc/macros.internal.h"
diff --git a/third_party/radpajama/gptneox-util.h b/third_party/radpajama/gptneox-util.h
index c05b3333b..817ef6066 100644
--- a/third_party/radpajama/gptneox-util.h
+++ b/third_party/radpajama/gptneox-util.h
@@ -1,4 +1,4 @@
-// -*- c++ -*-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef GPTNEOX_UTIL_H
 #define GPTNEOX_UTIL_H
 #include "libc/calls/calls.h"
diff --git a/third_party/radpajama/gptneox.cc b/third_party/radpajama/gptneox.cc
index bf142851b..66152e1e4 100644
--- a/third_party/radpajama/gptneox.cc
+++ b/third_party/radpajama/gptneox.cc
@@ -28,6 +28,8 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "third_party/radpajama/gptneox.h"
 #include "libc/intrin/bits.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/posix.h"
 #include "third_party/ggml/fp16.h"
 #include "third_party/ggml/ggml.h"
 #include "third_party/ggml/llama_util.h"
@@ -77,7 +79,7 @@ static const size_t MiB = 1024*1024;
 //       needs modifications in ggml
 
 // TODO: Modify for gptneox, how are these values actually determined?
-// TODO: This is now priority, 
+// TODO: This is now priority,
 static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
 {
     static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
@@ -446,7 +448,8 @@ struct gptneox_load_tensors_map {
 enum gptneox_file_version {
     GPTNEOX_FILE_VERSION_GGML,
     GPTNEOX_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
-    GPTNEOX_FILE_VERSION_GGJT_V1, // added padding
+    GPTNEOX_FILE_VERSION_GGJT_V1, // adopted unified aligned mappable layout
+    GPTNEOX_FILE_VERSION_GGJT_V2, // changed quantization format
 };
 
 struct gptneox_file_loader {
@@ -473,10 +476,16 @@ struct gptneox_file_loader {
 
         if (magic == READ32BE("ggml") && version == 0) {
             file_version = GPTNEOX_FILE_VERSION_GGML;
+            ggjt_v1();
         } else if (magic == READ32BE("ggmf") && version == 1) {
             file_version = GPTNEOX_FILE_VERSION_GGMF_V1;
+            ggjt_v1();
         } else if (magic == READ32BE("ggjt") && version == 1) {
             file_version = GPTNEOX_FILE_VERSION_GGJT_V1;
+            ggjt_v1();
+        } else if (magic == READ32BE("ggjt") && version == 2) {
+            file_version = GPTNEOX_FILE_VERSION_GGJT_V2;
+            ggjt_v2();
         } else {
             Die("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
@@ -566,17 +575,20 @@ struct gptneox_file_loader {
 struct gptneox_file_saver {
     gptneox_file file;
     gptneox_file_loader * any_file_loader;
-    gptneox_file_saver(const char * fname, gptneox_file_loader * any_file_loader, enum gptneox_ftype new_ftype)
-        : file(fname, "wb"), any_file_loader(any_file_loader) {
+    gptneox_file_saver(const char * fname,
+                       gptneox_file_loader * any_file_loader,
+                       enum gptneox_ftype new_ftype)
+            : file(fname, "wb"),
+              any_file_loader(any_file_loader) {
         fprintf(stderr, "gptneox.cpp: saving model to %s\n", fname);
-        ggjt_v1();
         write_magic();
         write_hparams(new_ftype);
         write_vocab();
     }
     void write_magic() {
+        ggjt_v2();
         file.write_u32(READ32BE("ggjt")); // magic
-        file.write_u32(1); // version
+        file.write_u32(2); // version
     }
     void write_hparams(enum gptneox_ftype new_ftype) {
         const gptneox_hparams & hparams = any_file_loader->hparams;
@@ -887,7 +899,8 @@ static const char *gptneox_file_version_name(gptneox_file_version version) {
     switch (version) {
         case GPTNEOX_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case GPTNEOX_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case GPTNEOX_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
+        case GPTNEOX_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+        case GPTNEOX_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
         default: GPTNEOX_ASSERT(false);
     }
 }
@@ -940,7 +953,7 @@ static void gptneox_model_load_internal(
     model.hparams = ml->file_loaders.at(0)->hparams;
     gptneox_file_version file_version = ml->file_loaders.at(0)->file_version;
     auto & hparams = model.hparams;
-    
+
     {
         switch (hparams.n_layer) {
             case 16: {
@@ -951,7 +964,7 @@ static void gptneox_model_load_internal(
                 }
                 break;
             }
-            // # <RedPajama>: we extend the model type settings for RedPajama models.  
+            // # <RedPajama>: we extend the model type settings for RedPajama models.
             case 32:{
                 if (hparams.n_embd == 2560) {
                     model.type = e_model::MODEL_3B;
@@ -1195,7 +1208,7 @@ static bool gptneox_eval_internal(
                                     model.layers[il].c_attn_attn_b, cur),
                         cur);
             }
-             
+
             // Split QKV and make contiguous
             struct ggml_tensor * Qcur = ggml_view_3d(ctx0, cur,
                                             n_embd/n_head,
@@ -1225,7 +1238,7 @@ static bool gptneox_eval_internal(
                         ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));
             Vcur = ggml_cpy(ctx0, Vcur,
                         ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));
-            
+
             // MARK: gptneox RoPE Q and K, before cache
             // Bit 2 for gptneox style (2)
             // Bit 1 is zero for dont skip n_past +(0), use (2+1) = (3) if rope is applied to cache of k (after cache only)
@@ -1241,7 +1254,7 @@ static bool gptneox_eval_internal(
                             ggml_element_size(Vcur) * n_embd,
                             0);
                 Vcur = ggml_transpose(ctx0, Vcur);
-            
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
                                             n_embd * N, // num elements in current context (up to n_embd*n_ctx but usually less)
                                             ggml_element_size(kv_self.k) * n_embd * (il * n_ctx + n_past));
@@ -1250,12 +1263,12 @@ static bool gptneox_eval_internal(
                                             n_embd,
                                             ggml_element_size(kv_self.v) * n_ctx,
                                             ggml_element_size(kv_self.v) * ((il * n_ctx * n_embd) + n_past));
-            
+
                 // important: storing RoPE-ed version of K in the KV cache!
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
             //}
-            
+
             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
@@ -1284,7 +1297,7 @@ static bool gptneox_eval_internal(
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
             // KQ = soft_max(KQ_masked)
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-            
+
             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             struct ggml_tensor * V_trans = ggml_view_3d(ctx0, kv_self.v,
                                                 n_past + N,
@@ -1312,10 +1325,10 @@ static bool gptneox_eval_internal(
         }
 
         lctx.use_buf(ctx0, 1);
-        
+
         if (hparams.use_parallel_residual == 1) {
             //printf("use_parallel_residual == 1\n");
-            
+
             // This is independent of the self-attention result, so it could be done in parallel to the self-attention
             struct ggml_tensor * outAttn = cur;
 
@@ -1359,7 +1372,7 @@ static bool gptneox_eval_internal(
             inpL = ggml_add(ctx0, inpL, cur);
         } else if (hparams.use_parallel_residual == 0) {
             //printf("use_parallel_residual == 0\n");
-            
+
             // This takes the self-attention residual output as input to Feedforward
             struct ggml_tensor * outAttn = cur;
             struct ggml_tensor * inpFF = ggml_add(ctx0, outAttn, inpL);
@@ -2093,6 +2106,7 @@ int gptneox_model_copy(
 static void gptneox_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum gptneox_ftype ftype, int nthread) {
     ggml_type quantized_type;
     switch (ftype) {
+        case GPTNEOX_FTYPE_MOSTLY_F16:  quantized_type = GGML_TYPE_F16;  break;
         case GPTNEOX_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case GPTNEOX_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
         case GPTNEOX_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
@@ -2124,21 +2138,17 @@ static void gptneox_model_quantize_internal(const std::string & fname_inp, const
         tensor.data = read_data.addr;
         model_loader->load_data_for(tensor);
 
-        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
+        printf("[%4zu/%4zu] %50s - %16s, type = %6s, ",
                ++idx, model_loader->tensors_map.tensors.size(),
                tensor.name.c_str(), gptneox_format_tensor_shape(tensor.ne).c_str(),
                ggml_type_name(tensor.type));
 
-        // This used to be a regex, but <regex> has an extreme cost to compile times.
-        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
-
-        // quantize only 2D tensors
-        quantize &= (tensor.ne.size() == 2);
-
-        // uncomment this to keep the output layer in FP16
-        //if (tensor.name == "output.weight") {
-        //    quantize = false;
-        //}
+        // only quantize 2d weights that aren't the output layer
+        bool quantize =
+                tensor.ne.size() == 2 &&
+                tensor.type != quantized_type &&
+                _endswith(tensor.name.c_str(), "weight") &&
+                tensor.name != "output.weight";
 
         enum ggml_type new_type;
         void * new_data;
@@ -2150,6 +2160,14 @@ static void gptneox_model_quantize_internal(const std::string & fname_inp, const
             new_data = tensor.data;
             new_size = tensor.size;
             printf("size = %8.3f MiB\n", tensor.size/1024.0/1024.0);
+        } else if (quantized_type == GGML_TYPE_F16) {
+            GPTNEOX_ASSERT(tensor.type == GGML_TYPE_F32);
+            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+            new_type = quantized_type;
+            new_size = nelements * 2;
+            work.resize(new_size);
+            new_data = work.addr;
+            ggml_fp32_to_fp16_row((const float *)tensor.data, (ggml_fp16_t *)new_data, nelements);
         } else {
             new_type = quantized_type;
             float * f32_data;
diff --git a/third_party/radpajama/gptneox.h b/third_party/radpajama/gptneox.h
index 25755e127..aaf62fcc9 100644
--- a/third_party/radpajama/gptneox.h
+++ b/third_party/radpajama/gptneox.h
@@ -1,4 +1,4 @@
-// -*- c++ -*-
+// -*- c++; c-basic-offset:4 -*-
 #ifndef GPTNEOX_H
 #define GPTNEOX_H
 // clang-format off
diff --git a/third_party/radpajama/quantize-gptneox.cc b/third_party/radpajama/quantize-gptneox.cc
index 4d68f8325..826117bae 100644
--- a/third_party/radpajama/quantize-gptneox.cc
+++ b/third_party/radpajama/quantize-gptneox.cc
@@ -28,6 +28,7 @@
 ╚─────────────────────────────────────────────────────────────────────────────*/
 #include "libc/log/log.h"
 #include "third_party/ggml/ggml.h"
+#include "third_party/ggml/llama_util.h"
 #include "third_party/libcxx/cstdio"
 #include "third_party/libcxx/map"
 #include "third_party/libcxx/string"
@@ -35,13 +36,14 @@
 // clang-format off
 
 static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
-  {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
-  {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
-  {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
-  //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
-  {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
-  {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
-  {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
+    {"f16", GPTNEOX_FTYPE_MOSTLY_F16},
+    {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
+    {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
+    //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
+    {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
 };
 
 // usage:
@@ -50,7 +52,7 @@ static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
 int main(int argc, char ** argv) {
     ShowCrashReports();
 
-    ggjt_v1();
+    ggjt_v2();
     ggml_time_init();
 
     if (argc < 4) {
@@ -71,8 +73,13 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];
 
+    if (fname_inp == fname_out) {
+        fprintf(stderr, "%s: input and output names are same\n", fname_inp.c_str());
+        exit(1);
+    }
+
     enum gptneox_ftype ftype;
-    if (argv[3][0] == 'q') {
+    if (!is_integer_str(argv[3])) {
         auto it = GPTNEOX_FTYPE_MAP.find(argv[3]);
         if (it == GPTNEOX_FTYPE_MAP.end()) {
             fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
diff --git a/tool/emacs/cosmo-c-builtins.el b/tool/emacs/cosmo-c-builtins.el
index e54408191..ae255a493 100644
--- a/tool/emacs/cosmo-c-builtins.el
+++ b/tool/emacs/cosmo-c-builtins.el
@@ -72,6 +72,7 @@
            "__builtin_extract_return_addr"
            "__builtin_isnan"
            "__builtin_signbit"
+           "__builtin_signbitf"
            "__builtin_signbitl"
            "__builtin_ffs"
            "__builtin_ffsl"