mirror of https://github.com/jart/cosmopolitan.git (synced 2025-02-07 06:53:33 +00:00)
Introduce support for GGJT v3 file format
llama.com can now load weights that use the new file format, which was introduced upstream a few weeks ago. Unlike llama.cpp, we will keep support for the old file formats in our tool, so you don't need to convert your weights whenever the upstream project makes a breaking change. Note that GGJT v3 does make AVX2 inference about 5% faster for me.
parent 6ae18a10ba
commit 8fdb31681a
33 changed files with 3829 additions and 371 deletions
@@ -43,6 +43,7 @@ const char *commandvenv(const char *, const char *);
 const char *GetAddr2linePath(void);
 const char *GetGdbPath(void);
 void ShowCrashReports(void);
+int MakeProcessNice(void);
 bool32 IsDebuggerPresent(bool);
 bool IsRunningUnderMake(void);
 const char *GetSiCodeName(int, int);
41 libc/log/makeprocessnice.c Normal file
@@ -0,0 +1,41 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2023 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/calls/calls.h"
#include "libc/calls/struct/sched_param.h"
#include "libc/errno.h"
#include "libc/log/log.h"
#include "libc/sysv/consts/ioprio.h"
#include "libc/sysv/consts/prio.h"
#include "libc/sysv/consts/sched.h"

/**
 * Makes current process as low-priority as possible.
 *
 * @return 0 on success, or -1 w/ errno
 * @note error reporting currently not implemented
 */
int MakeProcessNice(void) {
  int e = errno;
  setpriority(PRIO_PROCESS, 0, 10);
  ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
  struct sched_param param = {sched_get_priority_min(SCHED_IDLE)};
  sched_setscheduler(0, SCHED_IDLE, &param);
  errno = e;
  return 0;
}
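For context, here is a minimal sketch of how a caller might use the new function. The batch-worker framing is an assumption, and the prototype is declared inline rather than assuming which header exports it.

int MakeProcessNice(void);  // added by this commit (see hunk above)

// Hypothetical batch worker: drop CPU, I/O, and scheduler priority up
// front so heavy inference work doesn't starve interactive processes.
int main(void) {
  MakeProcessNice();  // best effort; the function preserves errno
  // ... run the long-lived, low-priority workload here ...
  return 0;
}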
@@ -230,7 +230,7 @@ relegated void __oncrash_arm64(int sig, struct siginfo *si, void *arg) {
    // simply examining the program counter.
    pc = ctx->uc_mcontext.pc;
    Append(b, " %016lx sp %lx pc", ctx->uc_mcontext.sp, pc);
-   if (pc && (symbol = __get_symbol(st, pc))) {
+   if (pc && st && (symbol = __get_symbol(st, pc))) {
      addend = pc - st->addr_base;
      addend -= st->symbols[symbol].x;
      Append(b, " ");
@@ -251,7 +251,7 @@ relegated void __oncrash_arm64(int sig, struct siginfo *si, void *arg) {
    fp = (struct StackFrame *)ctx->uc_mcontext.regs[29];
    if (IsCode((pc = ctx->uc_mcontext.regs[30]))) {
      Append(b, " %016lx sp %lx lr", ctx->uc_mcontext.sp, pc);
-     if (pc && (symbol = __get_symbol(st, pc))) {
+     if (pc && st && (symbol = __get_symbol(st, pc))) {
        addend = pc - st->addr_base;
        addend -= st->symbols[symbol].x;
        Append(b, " ");
@@ -282,7 +282,7 @@ relegated void __oncrash_arm64(int sig, struct siginfo *si, void *arg) {
      Append(b, " <truncated backtrace>\n");
      break;
    }
-   if ((pc = fp->addr)) {
+   if (st && (pc = fp->addr)) {
      if ((symbol = __get_symbol(st, pc))) {
        addend = pc - st->addr_base;
        addend -= st->symbols[symbol].x;
@@ -294,7 +294,7 @@ relegated void __oncrash_arm64(int sig, struct siginfo *si, void *arg) {
        addend = 0;
      }
      Append(b, " %016lx fp %lx lr ", fp, pc);
-     if (!AppendFileLine(b, addr2line, debugbin, pc)) {
+     if (!AppendFileLine(b, addr2line, debugbin, pc) && st) {
        Append(b, "%s", __get_symbol_name(st, symbol));
        if (addend) Append(b, "%+d", addend);
      }
@@ -16,12 +16,69 @@
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "ape/sections.internal.h"
#include "libc/calls/calls.h"
#include "libc/elf/def.h"
#include "libc/elf/elf.h"
#include "libc/elf/struct/ehdr.h"
#include "libc/elf/struct/sym.h"
#include "libc/errno.h"
#include "libc/intrin/bits.h"
#include "libc/macros.internal.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/symbols.internal.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/prot.h"

static bool GetElfSymbolValue(const Elf64_Ehdr *ehdr, size_t esize,
                              const char *name, uint64_t *res) {
  Elf64_Xword i, n;
  const char *stab;
  const Elf64_Sym *st;
  if ((stab = GetElfStringTable(ehdr, esize)) &&
      (st = GetElfSymbolTable(ehdr, esize, &n))) {
    for (i = 0; i < n; ++i) {
      if (!strcmp(stab + st[i].st_name, name)) {
        *res = st[i].st_value;
        return true;
      }
    }
  }
  return false;
}

static bool IsMyDebugBinaryImpl(const char *path) {
  int fd;
  void *map;
  int64_t size;
  uintptr_t value;
  bool res = false;
  if ((fd = open(path, O_RDONLY | O_CLOEXEC, 0)) != -1) {
    // sanity test that this .com.dbg file (1) is an elf image, and (2)
    // contains the same number of bytes of code as our .com executable
    // which is currently running in memory.
    if ((size = lseek(fd, 0, SEEK_END)) != -1 &&
        (map = mmap(0, size, PROT_READ, MAP_SHARED, fd, 0)) != MAP_FAILED) {
      if (GetElfSymbolValue(map, size, "_etext", &value)) {
        res = !_etext || value == (uintptr_t)_etext;
      }
      munmap(map, size);
    }
    close(fd);
  }
  return res;
}

static bool IsMyDebugBinary(const char *path) {
  int e;
  bool res;
  e = errno;
  res = IsMyDebugBinaryImpl(path);
  errno = e;
  return res;
}

/**
 * Returns path of binary with the debug information, or null.
@@ -42,12 +99,12 @@ const char *FindDebugBinary(void) {
  } else if (n > 4 && READ32LE(p + n - 4) == READ32LE(".com") &&
             n + 4 < ARRAYLEN(buf)) {
    mempcpy(mempcpy(buf, p, n), ".dbg", 5);
-   if (fileexists(buf)) {
+   if (IsMyDebugBinary(buf)) {
      res = buf;
    }
  } else if (n + 8 < ARRAYLEN(buf)) {
    mempcpy(mempcpy(buf, p, n), ".com.dbg", 9);
-   if (fileexists(buf)) {
+   if (IsMyDebugBinary(buf)) {
      res = buf;
    }
  }
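A hedged sketch of how a consumer might rely on the stricter check; the crash-reporting framing follows the comments in the code above, but the printing and the function name are illustrative only, and the prototype is declared inline rather than assuming its header.

#include "libc/stdio/stdio.h"

const char *FindDebugBinary(void);

// Hypothetical use: only symbolize against the .dbg file when it really
// belongs to the running executable, which is what the IsMyDebugBinary()
// gate above now guarantees.
void ReportWhereSymbolsComeFrom(void) {
  const char *dbg = FindDebugBinary();
  if (dbg) {
    printf("symbolizing against %s\n", dbg);
  } else {
    printf("no matching debug binary; raw addresses only\n");
  }
}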
7 third_party/ggml/README.cosmo vendored
@@ -9,13 +9,11 @@ LICENSE
ORIGIN

https://github.com/ggerganov/llama.cpp
commit 0b2da20538d01926b77ea237dd1c930c4d20b686
Author: Stephan Walter <stephan@walter.name>
Date: Wed Apr 26 20:26:42 2023 +0000
ggml : slightly faster AVX2 implementation for Q5 (#1197)
d8bd0013e8768aaa3dc9cfc1ff01499419d5348e

LOCAL CHANGES

- Maintaining support for deprecated file formats
- Make it possible for loaded prompts to be cached to disk
- Introduce -v and --verbose flags
- Reduce batch size from 512 to 32
@@ -25,7 +23,6 @@ LOCAL CHANGES
- Change --reverse-prompt to no longer imply --interactive
- Permit --reverse-prompt specifying custom EOS if non-interactive
- Refactor headers per cosmo convention
- Replace code like 'ggjt' with READ32BE("ggjt")
- Remove C++ exceptions; use Die() function instead
- Removed division from matrix multiplication.
- Let quantizer convert between ggmt formats
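The "Replace code like 'ggjt' with READ32BE(\"ggjt\")" item above refers to avoiding compiler-specific multi-character literals; a small sketch of the idea follows. The helper function is illustrative and not part of the diff; READ32BE comes from libc/intrin/bits.h, which the diff itself includes, and READ32BE("ggjt") evaluates to 0x67676a74.

#include "libc/intrin/bits.h"

// Build the file magic from the string's bytes instead of writing the
// non-portable multi-character literal 'ggjt'.
static int is_ggjt_magic(uint32_t magic) {
  return magic == READ32BE("ggjt");
}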
16 third_party/ggml/common.cc vendored
@@ -34,6 +34,7 @@
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/sysv/consts/fileno.h"
#include "third_party/ggml/llama.h"
#include "third_party/ggml/llama_util.h"
#include "third_party/libcxx/algorithm"
#include "third_party/libcxx/cassert"
@@ -258,6 +259,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        params.use_color = true;
    } else if (arg == "--mlock") {
        params.use_mlock = true;
+   } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+       if (++i >= argc) {
+           invalid_param = true;
+           break;
+       }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+       params.n_gpu_layers = std::stoi(argv[i]);
+#else
+       fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+       fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
    } else if (arg == "--no-mmap") {
        params.use_mmap = false;
    } else if (arg == "--mtest") {
@@ -425,6 +437,10 @@ void gpt_print_usage(FILE *f, int /*argc*/, char ** argv, const gpt_params & par
    if (llama_mmap_supported()) {
        fprintf(f, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+   fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
+   fprintf(stderr, " number of layers to store in VRAM\n");
+#endif
    fprintf(f, " --mtest compute maximum memory usage\n");
    fprintf(f, " --verbose-prompt print prompt before generation\n");
    fprintf(f, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
1 third_party/ggml/common.h vendored
@@ -27,6 +27,7 @@ struct gpt_params {
    int32_t n_ctx = 512; // context size
    int32_t n_batch = 32; // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+   int32_t n_gpu_layers = 0; // number of layers to store in VRAM

    // sampling parameters
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
152 third_party/ggml/ggjt.v2.c vendored Normal file
@@ -0,0 +1,152 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
╞══════════════════════════════════════════════════════════════════════════════╡
│ Copyright 2023 Justine Alexandra Roberts Tunney │
│ │
│ Permission to use, copy, modify, and/or distribute this software for │
│ any purpose with or without fee is hereby granted, provided that the │
│ above copyright notice and this permission notice appear in all copies. │
│ │
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
│ PERFORMANCE OF THIS SOFTWARE. │
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/assert.h"
#include "third_party/ggml/ggjt.v2.q4_0.h"
#include "third_party/ggml/ggjt.v2.q4_1.h"
#include "third_party/ggml/ggjt.v2.q5_0.h"
#include "third_party/ggml/ggjt.v2.q5_1.h"
#include "third_party/ggml/ggjt.v2.q8_0.h"
#include "third_party/ggml/ggjt.v2.q8_1.h"
#include "third_party/ggml/ggml.h"
// clang-format off

static const int ggjt_v2_blck_size[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F32]  = 1,
    [GGML_TYPE_F16]  = 1,
    [GGML_TYPE_Q4_0] = V2_QK4_0,
    [GGML_TYPE_Q4_1] = V2_QK4_1,
    [GGML_TYPE_Q5_0] = V2_QK5_0,
    [GGML_TYPE_Q5_1] = V2_QK5_1,
    [GGML_TYPE_Q8_0] = V2_QK8_0,
    [GGML_TYPE_Q8_1] = V2_QK8_1,
    [GGML_TYPE_I8]   = 1,
    [GGML_TYPE_I16]  = 1,
    [GGML_TYPE_I32]  = 1,
};

static const size_t ggjt_v2_type_size[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F32]  = sizeof(float),
    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
    [GGML_TYPE_Q4_0] = sizeof(block_v2_q4_0),
    [GGML_TYPE_Q4_1] = sizeof(block_v2_q4_1),
    [GGML_TYPE_Q5_0] = sizeof(block_v2_q5_0),
    [GGML_TYPE_Q5_1] = sizeof(block_v2_q5_1),
    [GGML_TYPE_Q8_0] = sizeof(block_v2_q8_0),
    [GGML_TYPE_Q8_1] = sizeof(block_v2_q8_1),
    [GGML_TYPE_I8]   = sizeof(int8_t),
    [GGML_TYPE_I16]  = sizeof(int16_t),
    [GGML_TYPE_I32]  = sizeof(int32_t),
};

static const char *const ggjt_v2_type_name[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F32]  = "f32",
    [GGML_TYPE_F16]  = "f16",
    [GGML_TYPE_Q4_0] = "q4_0",
    [GGML_TYPE_Q4_1] = "q4_1",
    [GGML_TYPE_Q4_2] = "q4_2",
    [GGML_TYPE_Q5_0] = "q5_0",
    [GGML_TYPE_Q5_1] = "q5_1",
    [GGML_TYPE_Q8_0] = "q8_0",
    [GGML_TYPE_Q8_1] = "q8_1",
    [GGML_TYPE_I8]   = "i8",
    [GGML_TYPE_I16]  = "i16",
    [GGML_TYPE_I32]  = "i32",
};

static const bool ggjt_v2_is_quantized[GGML_TYPE_COUNT] = {
    [GGML_TYPE_F32]  = false,
    [GGML_TYPE_F16]  = false,
    [GGML_TYPE_Q4_0] = true,
    [GGML_TYPE_Q4_1] = true,
    [GGML_TYPE_Q5_0] = true,
    [GGML_TYPE_Q5_1] = true,
    [GGML_TYPE_Q8_0] = true,
    [GGML_TYPE_Q8_1] = true,
    [GGML_TYPE_I8]   = false,
    [GGML_TYPE_I16]  = false,
    [GGML_TYPE_I32]  = false,
};

static const quantize_chunk_f *const ggjt_v2_quantize_chunk[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q4_0] = (void *)ggml_quantize_v2_q4_0,
    [GGML_TYPE_Q4_1] = (void *)ggml_quantize_v2_q4_1,
    [GGML_TYPE_Q5_0] = (void *)ggml_quantize_v2_q5_0,
    [GGML_TYPE_Q5_1] = (void *)ggml_quantize_v2_q5_1,
    [GGML_TYPE_Q8_0] = (void *)ggml_quantize_v2_q8_0,
};

static const quantize_fns_t ggjt_v2_quantize_fns[GGML_TYPE_COUNT] = {
    [GGML_TYPE_Q4_0] = {
        .dequantize_row_q = dequantize_row_v2_q4_0,
        .quantize_row_q = quantize_row_v2_q4_0,
        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q4_0_reference,
        .quantize_row_q_dot = quantize_row_v2_q8_0,
        .vec_dot_q = ggml_vec_dot_v2_q4_0_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q4_1] = {
        .dequantize_row_q = dequantize_row_v2_q4_1,
        .quantize_row_q = quantize_row_v2_q4_1,
        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q4_1_reference,
        .quantize_row_q_dot = quantize_row_v2_q8_1,
        .vec_dot_q = ggml_vec_dot_v2_q4_1_q8_1,
        .vec_dot_type = GGML_TYPE_Q8_1,
    },
    [GGML_TYPE_Q5_0] = {
        .dequantize_row_q = dequantize_row_v2_q5_0,
        .quantize_row_q = quantize_row_v2_q5_0,
        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q5_0_reference,
        .quantize_row_q_dot = quantize_row_v2_q8_0,
        .vec_dot_q = ggml_vec_dot_v2_q5_0_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q5_1] = {
        .dequantize_row_q = dequantize_row_v2_q5_1,
        .quantize_row_q = quantize_row_v2_q5_1,
        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q5_1_reference,
        .quantize_row_q_dot = quantize_row_v2_q8_1,
        .vec_dot_q = ggml_vec_dot_v2_q5_1_q8_1,
        .vec_dot_type = GGML_TYPE_Q8_1,
    },
    [GGML_TYPE_Q8_0] = {
        .dequantize_row_q = dequantize_row_v2_q8_0,
        .quantize_row_q = quantize_row_v2_q8_0,
        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q8_0_reference,
        .quantize_row_q_dot = quantize_row_v2_q8_0,
        .vec_dot_q = ggml_vec_dot_v2_q8_0_q8_0,
        .vec_dot_type = GGML_TYPE_Q8_0,
    },
    [GGML_TYPE_Q8_1] = {
        .dequantize_row_q = NULL, // TODO
        .quantize_row_q = quantize_row_v2_q8_1,
        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q8_1_reference,
        .quantize_row_q_dot = quantize_row_v2_q8_1,
        .vec_dot_q = NULL, // TODO
        .vec_dot_type = GGML_TYPE_Q8_1,
    },
};

void ggjt_v2(void) {
    GGML_BLCK_SIZE = ggjt_v2_blck_size;
    GGML_TYPE_SIZE = ggjt_v2_type_size;
    GGML_TYPE_NAME = ggjt_v2_type_name;
    GGML_IS_QUANTIZED = ggjt_v2_is_quantized;
    quantize_fns = ggjt_v2_quantize_fns;
    GGML_QUANTIZE_CHUNK = ggjt_v2_quantize_chunk;
}
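ggjt_v2() works by repointing ggml's global dispatch tables at the v2 variants. Below is a hedged sketch of how a loader might select tables per file version; the surrounding loader hook and version check are assumptions, and only ggjt_v2() comes from the file above.

void ggjt_v2(void);  // defined in ggjt.v2.c above

// Hypothetical loader hook: install the v2 block sizes, type sizes, and
// quantize_fns before any tensor data is read, so the rest of ggml sees
// the layout that matches this file format.
static void install_format_tables(int file_version) {
  if (file_version == 2) {
    ggjt_v2();  // swaps GGML_BLCK_SIZE, GGML_TYPE_SIZE, quantize_fns, etc.
  }
  // other versions would install their own tables here
}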
152 third_party/ggml/ggjt.v2.internal.h vendored Normal file
|
@ -0,0 +1,152 @@
|
|||
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_
|
||||
#include "libc/str/str.h"
|
||||
#include "third_party/aarch64/arm_neon.h"
|
||||
#include "third_party/intel/immintrin.internal.h"
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
#if __AVX__ || __AVX2__ || __AVX512F__
|
||||
// horizontally add 8 floats
|
||||
static inline float hsum_float_8(const __m256 x) {
|
||||
__m128 res = _mm256_extractf128_ps(x, 1);
|
||||
res = _mm_add_ps(res, _mm256_castps256_ps128(x));
|
||||
res = _mm_add_ps(res, _mm_movehl_ps(res, res));
|
||||
res = _mm_add_ss(res, _mm_movehdup_ps(res));
|
||||
return _mm_cvtss_f32(res);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if __AVX2__ || __AVX512F__
|
||||
|
||||
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
||||
static inline __m256i bytes_from_bits_32(const uint8_t *x) {
|
||||
uint32_t x32;
|
||||
memcpy(&x32, x, sizeof(uint32_t));
|
||||
const __m256i shuf_mask =
|
||||
_mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,
|
||||
0x0101010101010101, 0x0000000000000000);
|
||||
__m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
|
||||
const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
|
||||
bytes = _mm256_or_si256(bytes, bit_mask);
|
||||
return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
|
||||
}
|
||||
|
||||
// add int16_t pairwise and return as float vector
|
||||
static inline __m256 sum_i16_pairs_float(const __m256i x) {
|
||||
const __m256i ones = _mm256_set1_epi16(1);
|
||||
const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
|
||||
return _mm256_cvtepi32_ps(summed_pairs);
|
||||
}
|
||||
|
||||
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax,
|
||||
const __m256i sy) {
|
||||
#if __AVXVNNI__
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
|
||||
return _mm256_cvtepi32_ps(summed_pairs);
|
||||
#else
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
|
||||
return sum_i16_pairs_float(dot);
|
||||
#endif
|
||||
}
|
||||
|
||||
// multiply int8_t, add results pairwise twice and return as float vector
|
||||
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
||||
#if __AVXVNNIINT8__
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
|
||||
return _mm256_cvtepi32_ps(summed_pairs);
|
||||
#else
|
||||
// Get absolute values of x vectors
|
||||
const __m256i ax = _mm256_sign_epi8(x, x);
|
||||
// Sign the values of the y vectors
|
||||
const __m256i sy = _mm256_sign_epi8(y, x);
|
||||
return mul_sum_us8_pairs_float(ax, sy);
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline __m256i bytes_from_nibbles_32(const uint8_t *rsi) {
|
||||
const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
|
||||
const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
|
||||
const __m256i lowMask = _mm256_set1_epi8(0xF);
|
||||
return _mm256_and_si256(lowMask, bytes);
|
||||
}
|
||||
|
||||
#elif defined(__AVX__)
|
||||
|
||||
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
||||
static inline __m256i bytes_from_bits_32(const uint8_t *x) {
|
||||
uint32_t x32;
|
||||
memcpy(&x32, x, sizeof(uint32_t));
|
||||
const __m128i shuf_maskl =
|
||||
_mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
|
||||
const __m128i shuf_maskh =
|
||||
_mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
|
||||
__m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
|
||||
__m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
|
||||
const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
|
||||
bytesl = _mm_or_si128(bytesl, bit_mask);
|
||||
bytesh = _mm_or_si128(bytesh, bit_mask);
|
||||
bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
|
||||
bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
|
||||
return _mm256_set_m128i(bytesh, bytesl);
|
||||
}
|
||||
|
||||
// add int16_t pairwise and return as float vector
|
||||
static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
|
||||
const __m128i ones = _mm_set1_epi16(1);
|
||||
const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
|
||||
const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
|
||||
const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
|
||||
return _mm256_cvtepi32_ps(summed_pairs);
|
||||
}
|
||||
|
||||
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax,
|
||||
const __m256i sy) {
|
||||
const __m128i axl = _mm256_castsi256_si128(ax);
|
||||
const __m128i axh = _mm256_extractf128_si256(ax, 1);
|
||||
const __m128i syl = _mm256_castsi256_si128(sy);
|
||||
const __m128i syh = _mm256_extractf128_si256(sy, 1);
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m128i dotl = _mm_maddubs_epi16(axl, syl);
|
||||
const __m128i doth = _mm_maddubs_epi16(axh, syh);
|
||||
return sum_i16_pairs_float(doth, dotl);
|
||||
}
|
||||
|
||||
// multiply int8_t, add results pairwise twice and return as float vector
|
||||
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
||||
const __m128i xl = _mm256_castsi256_si128(x);
|
||||
const __m128i xh = _mm256_extractf128_si256(x, 1);
|
||||
const __m128i yl = _mm256_castsi256_si128(y);
|
||||
const __m128i yh = _mm256_extractf128_si256(y, 1);
|
||||
// Get absolute values of x vectors
|
||||
const __m128i axl = _mm_sign_epi8(xl, xl);
|
||||
const __m128i axh = _mm_sign_epi8(xh, xh);
|
||||
// Sign the values of the y vectors
|
||||
const __m128i syl = _mm_sign_epi8(yl, xl);
|
||||
const __m128i syh = _mm_sign_epi8(yh, xh);
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m128i dotl = _mm_maddubs_epi16(axl, syl);
|
||||
const __m128i doth = _mm_maddubs_epi16(axh, syh);
|
||||
return sum_i16_pairs_float(doth, dotl);
|
||||
}
|
||||
|
||||
// Unpack 32 4-bit fields into 32 bytes
|
||||
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
|
||||
static inline __m256i bytes_from_nibbles_32(const uint8_t *rsi) {
|
||||
// Load 16 bytes from memory
|
||||
__m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
|
||||
__m128i tmph = _mm_srli_epi16(tmpl, 4);
|
||||
const __m128i lowMask = _mm_set1_epi8(0xF);
|
||||
tmpl = _mm_and_si128(lowMask, tmpl);
|
||||
tmph = _mm_and_si128(lowMask, tmph);
|
||||
return _mm256_set_m128i(tmph, tmpl);
|
||||
}
|
||||
|
||||
#endif /* AVX */
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_ */
|
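To make the intent of the AVX helpers above concrete, here is a scalar sketch of what bytes_from_nibbles_32() computes; it is for illustration only, since the real helpers return a __m256i rather than writing to memory.

// Unpack 16 input bytes into 32 output values in [0, 15]: the 16 low
// nibbles first, then the 16 high nibbles, matching the AVX2 and AVX
// paths above.
static void bytes_from_nibbles_32_scalar(const unsigned char in[16],
                                         unsigned char out[32]) {
  for (int j = 0; j < 16; ++j) {
    out[j]      = in[j] & 0x0F;  // low nibble
    out[j + 16] = in[j] >> 4;    // high nibble
  }
}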
396 third_party/ggml/ggjt.v2.q4_0.c vendored Normal file
|
@ -0,0 +1,396 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ GGML │
|
||||
│ Copyright (c) 2023 Georgi Gerganov │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "third_party/ggml/ggjt.v2.q4_0.h"
|
||||
#include "libc/assert.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/math.h"
|
||||
#include "third_party/ggml/ggjt.v2.internal.h"
|
||||
#include "third_party/ggml/ggjt.v2.q8_0.h"
|
||||
// clang-format off
|
||||
|
||||
static_assert(sizeof(block_v2_q4_0) == sizeof(float) + V2_QK4_0 / 2,
|
||||
"wrong q4_0 block size/padding");
|
||||
|
||||
void dequantize_row_v2_q4_0(const void * restrict x_, float * restrict y, int k) {
|
||||
const block_v2_q4_0 * restrict x = x_;
|
||||
static const int qk = V2_QK4_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d = x[i].d;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int x0 = (x[i].qs[j] & 0x0F) - 8;
|
||||
const int x1 = (x[i].qs[j] >> 4) - 8;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t ggml_quantize_v2_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
assert(k % V2_QK4_0 == 0);
|
||||
const int nb = k / V2_QK4_0;
|
||||
|
||||
for (int b = 0; b < n; b += k) {
|
||||
block_v2_q4_0 * restrict y = (block_v2_q4_0 *) dst + b/V2_QK4_0;
|
||||
|
||||
quantize_row_v2_q4_0_reference(src + b, y, k);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int j = 0; j < V2_QK4_0; j += 2) {
|
||||
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
|
||||
const uint8_t vi1 = y[i].qs[j/2] >> 4;
|
||||
|
||||
hist[vi0]++;
|
||||
hist[vi1]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (n/V2_QK4_0*sizeof(block_v2_q4_0));
|
||||
}
|
||||
|
||||
void quantize_row_v2_q4_0(const float * restrict x, void * restrict y, int k) {
|
||||
quantize_row_v2_q4_0_reference(x, y, k);
|
||||
}
|
||||
|
||||
// reference implementation for deterministic creation of model files
|
||||
void quantize_row_v2_q4_0_reference(const float * restrict x, block_v2_q4_0 * restrict y, int k) {
|
||||
static const int qk = V2_QK4_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i*qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = max / -8;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = d;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const float x0 = x[i*qk + 0 + j]*id;
|
||||
const float x1 = x[i*qk + qk/2 + j]*id;
|
||||
|
||||
const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
|
||||
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
|
||||
|
||||
y[i].qs[j] = xi0;
|
||||
y[i].qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_vec_dot_v2_q4_0_q8_0(const int n,
|
||||
float * restrict s,
|
||||
const void * restrict vx,
|
||||
const void * restrict vy) {
|
||||
const int qk = V2_QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nb % 2 == 0);
|
||||
|
||||
const block_v2_q4_0 * restrict x = vx;
|
||||
const block_v2_q8_0 * restrict y = vy;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_v2_q4_0 * restrict x0 = &x[i + 0];
|
||||
const block_v2_q4_0 * restrict x1 = &x[i + 1];
|
||||
const block_v2_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_v2_q8_0 * restrict y1 = &y[i + 1];
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||
const int8x16_t s8b = vdupq_n_s8(0x8);
|
||||
|
||||
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
||||
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
||||
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
||||
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||
|
||||
// sub 8
|
||||
const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
|
||||
const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
|
||||
const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
|
||||
const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
|
||||
|
||||
// load y
|
||||
const int8x16_t v1_0l = vld1q_s8(y0->qs);
|
||||
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
|
||||
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
// dot product into int32x4_t
|
||||
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
|
||||
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
|
||||
#else
|
||||
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
|
||||
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
|
||||
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h));
|
||||
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
|
||||
|
||||
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l));
|
||||
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
|
||||
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h));
|
||||
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
|
||||
|
||||
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
||||
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
||||
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
||||
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
#elif defined(__AVX2__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
/* Compute combined scale for the block */
|
||||
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
|
||||
|
||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
||||
|
||||
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
|
||||
const __m256i off = _mm256_set1_epi8( 8 );
|
||||
bx = _mm256_sub_epi8( bx, off );
|
||||
|
||||
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
|
||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
||||
|
||||
/* Multiply q with scale and accumulate */
|
||||
acc = _mm256_fmadd_ps( d, q, acc );
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
// Compute combined scale for the block
|
||||
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
|
||||
|
||||
const __m128i lowMask = _mm_set1_epi8(0xF);
|
||||
const __m128i off = _mm_set1_epi8(8);
|
||||
|
||||
const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
|
||||
|
||||
__m128i bx = _mm_and_si128(lowMask, tmp);
|
||||
__m128i by = _mm_loadu_si128((const __m128i *)y[i].qs);
|
||||
bx = _mm_sub_epi8(bx, off);
|
||||
const __m128i i32_0 = mul_sum_i8_pairs(bx, by);
|
||||
|
||||
bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
|
||||
by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
|
||||
bx = _mm_sub_epi8(bx, off);
|
||||
const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
|
||||
|
||||
// Convert int32_t to float
|
||||
__m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
|
||||
|
||||
// Apply the scale, and accumulate
|
||||
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__SSSE3__)
|
||||
// set constants
|
||||
const __m128i lowMask = _mm_set1_epi8(0xF);
|
||||
const __m128i off = _mm_set1_epi8(8);
|
||||
|
||||
// Initialize accumulator with zeros
|
||||
__m128 acc_0 = _mm_setzero_ps();
|
||||
__m128 acc_1 = _mm_setzero_ps();
|
||||
__m128 acc_2 = _mm_setzero_ps();
|
||||
__m128 acc_3 = _mm_setzero_ps();
|
||||
|
||||
// First round without accumulation
|
||||
{
|
||||
_mm_prefetch(&x[0] + sizeof(block_v2_q4_0), _MM_HINT_T0);
|
||||
_mm_prefetch(&y[0] + sizeof(block_v2_q8_0), _MM_HINT_T0);
|
||||
|
||||
// Compute combined scale for the block 0 and 1
|
||||
const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) );
|
||||
|
||||
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
|
||||
|
||||
__m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
|
||||
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
|
||||
bx_0 = _mm_sub_epi8(bx_0, off);
|
||||
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
|
||||
|
||||
__m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
|
||||
__m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16));
|
||||
bx_1 = _mm_sub_epi8(bx_1, off);
|
||||
const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
|
||||
|
||||
_mm_prefetch(&x[1] + sizeof(block_v2_q4_0), _MM_HINT_T0);
|
||||
_mm_prefetch(&y[1] + sizeof(block_v2_q8_0), _MM_HINT_T0);
|
||||
|
||||
// Compute combined scale for the block 2 and 3
|
||||
const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) );
|
||||
|
||||
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
|
||||
|
||||
__m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
|
||||
__m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs);
|
||||
bx_2 = _mm_sub_epi8(bx_2, off);
|
||||
const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
|
||||
|
||||
__m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
|
||||
__m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16));
|
||||
bx_3 = _mm_sub_epi8(bx_3, off);
|
||||
const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
|
||||
|
||||
// Convert int32_t to float
|
||||
__m128 p0 = _mm_cvtepi32_ps(i32_0);
|
||||
__m128 p1 = _mm_cvtepi32_ps(i32_1);
|
||||
__m128 p2 = _mm_cvtepi32_ps(i32_2);
|
||||
__m128 p3 = _mm_cvtepi32_ps(i32_3);
|
||||
|
||||
// Apply the scale
|
||||
acc_0 = _mm_mul_ps( d_0_1, p0 );
|
||||
acc_1 = _mm_mul_ps( d_0_1, p1 );
|
||||
acc_2 = _mm_mul_ps( d_2_3, p2 );
|
||||
acc_3 = _mm_mul_ps( d_2_3, p3 );
|
||||
}
|
||||
|
||||
// Main loop
|
||||
for (int i = 2; i < nb; i+=2) {
|
||||
_mm_prefetch(&x[i] + sizeof(block_v2_q4_0), _MM_HINT_T0);
|
||||
_mm_prefetch(&y[i] + sizeof(block_v2_q8_0), _MM_HINT_T0);
|
||||
|
||||
// Compute combined scale for the block 0 and 1
|
||||
const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) );
|
||||
|
||||
const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
|
||||
|
||||
__m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
|
||||
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
|
||||
bx_0 = _mm_sub_epi8(bx_0, off);
|
||||
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
|
||||
|
||||
__m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
|
||||
__m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
|
||||
bx_1 = _mm_sub_epi8(bx_1, off);
|
||||
const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
|
||||
|
||||
_mm_prefetch(&x[i] + 2 * sizeof(block_v2_q4_0), _MM_HINT_T0);
|
||||
_mm_prefetch(&y[i] + 2 * sizeof(block_v2_q8_0), _MM_HINT_T0);
|
||||
|
||||
// Compute combined scale for the block 2 and 3
|
||||
const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) );
|
||||
|
||||
const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
|
||||
|
||||
__m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
|
||||
__m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs);
|
||||
bx_2 = _mm_sub_epi8(bx_2, off);
|
||||
const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
|
||||
|
||||
__m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
|
||||
__m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16));
|
||||
bx_3 = _mm_sub_epi8(bx_3, off);
|
||||
const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
|
||||
|
||||
// Convert int32_t to float
|
||||
__m128 p0 = _mm_cvtepi32_ps(i32_0);
|
||||
__m128 p1 = _mm_cvtepi32_ps(i32_1);
|
||||
__m128 p2 = _mm_cvtepi32_ps(i32_2);
|
||||
__m128 p3 = _mm_cvtepi32_ps(i32_3);
|
||||
|
||||
// Apply the scale
|
||||
__m128 p0_d = _mm_mul_ps( d_0_1, p0 );
|
||||
__m128 p1_d = _mm_mul_ps( d_0_1, p1 );
|
||||
__m128 p2_d = _mm_mul_ps( d_2_3, p2 );
|
||||
__m128 p3_d = _mm_mul_ps( d_2_3, p3 );
|
||||
|
||||
// Acummulate
|
||||
acc_0 = _mm_add_ps(p0_d, acc_0);
|
||||
acc_1 = _mm_add_ps(p1_d, acc_1);
|
||||
acc_2 = _mm_add_ps(p2_d, acc_2);
|
||||
acc_3 = _mm_add_ps(p3_d, acc_3);
|
||||
}
|
||||
|
||||
*s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
|
||||
#else
|
||||
// scalar
|
||||
float sumf = 0.0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
int sumi = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int v0 = (x[i].qs[j] & 0x0F) - 8;
|
||||
const int v1 = (x[i].qs[j] >> 4) - 8;
|
||||
|
||||
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
sumf += (x[i].d*y[i].d)*sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#endif
|
||||
}
|
22 third_party/ggml/ggjt.v2.q4_0.h vendored Normal file
|
@ -0,0 +1,22 @@
|
|||
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_0_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_0_H_
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
#define V2_QK4_0 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
uint8_t qs[V2_QK4_0 / 2]; // nibbles / quants
|
||||
} block_v2_q4_0;
|
||||
|
||||
void dequantize_row_v2_q4_0(const void* restrict, float* restrict, int);
|
||||
size_t ggml_quantize_v2_q4_0(const float*, void*, int, int, int64_t*);
|
||||
void quantize_row_v2_q4_0(const float* restrict, void* restrict, int);
|
||||
void quantize_row_v2_q4_0_reference(const float* restrict,
|
||||
block_v2_q4_0* restrict, int);
|
||||
void ggml_vec_dot_v2_q4_0_q8_0(const int, float* restrict, const void* restrict,
|
||||
const void* restrict);
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_0_H_ */
|
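A small round-trip sketch using the q4_0 routines declared above; the test values are arbitrary, and the expectation of roughly 4-bit precision follows from the d = max/-8 scale used by the reference quantizer.

#include "third_party/ggml/ggjt.v2.q4_0.h"

// Quantize one 32-element block with the reference routine, then
// dequantize it again. Each value is stored as a 4-bit code scaled by
// d = max/-8, so the round trip is lossy but close for smooth data.
int main(void) {
  float src[V2_QK4_0], out[V2_QK4_0];
  block_v2_q4_0 blk;
  for (int j = 0; j < V2_QK4_0; ++j) src[j] = 0.25f * (j - 16);
  quantize_row_v2_q4_0_reference(src, &blk, V2_QK4_0);
  dequantize_row_v2_q4_0(&blk, out, V2_QK4_0);
  return 0;
}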
253 third_party/ggml/ggjt.v2.q4_1.c vendored Normal file
|
@ -0,0 +1,253 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ GGML │
|
||||
│ Copyright (c) 2023 Georgi Gerganov │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "third_party/ggml/ggjt.v2.q4_1.h"
|
||||
#include "libc/assert.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/math.h"
|
||||
#include "third_party/ggml/ggjt.v2.internal.h"
|
||||
#include "third_party/ggml/ggjt.v2.q8_1.h"
|
||||
// clang-format off
|
||||
|
||||
static_assert(sizeof(block_v2_q4_1) == 2 * sizeof(float) + V2_QK4_1 / 2,
|
||||
"wrong q4_1 block size/padding");
|
||||
|
||||
void dequantize_row_v2_q4_1(const void * restrict x_, float * restrict y, int k) {
|
||||
const block_v2_q4_1 * restrict x = x_;
|
||||
static const int qk = V2_QK4_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d = x[i].d;
|
||||
const float m = x[i].m;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int x0 = (x[i].qs[j] & 0x0F);
|
||||
const int x1 = (x[i].qs[j] >> 4);
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t ggml_quantize_v2_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
assert(k % V2_QK4_1 == 0);
|
||||
const int nb = k / V2_QK4_1;
|
||||
|
||||
for (int b = 0; b < n; b += k) {
|
||||
block_v2_q4_1 * restrict y = (block_v2_q4_1 *) dst + b/V2_QK4_1;
|
||||
|
||||
quantize_row_v2_q4_1_reference(src + b, y, k);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int j = 0; j < V2_QK4_1; j += 2) {
|
||||
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
|
||||
const uint8_t vi1 = y[i].qs[j/2] >> 4;
|
||||
|
||||
hist[vi0]++;
|
||||
hist[vi1]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (n/V2_QK4_1*sizeof(block_v2_q4_1));
|
||||
}
|
||||
|
||||
void quantize_row_v2_q4_1(const float * restrict x, void * restrict y, int k) {
|
||||
quantize_row_v2_q4_1_reference(x, y, k);
|
||||
}
|
||||
|
||||
void ggml_vec_dot_v2_q4_1_q8_1(const int n,
|
||||
float * restrict s,
|
||||
const void * restrict vx,
|
||||
const void * restrict vy) {
|
||||
const int qk = V2_QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nb % 2 == 0);
|
||||
|
||||
const block_v2_q4_1 * restrict x = vx;
|
||||
const block_v2_q8_1 * restrict y = vy;
|
||||
|
||||
// TODO: add WASM SIMD
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
float summs = 0;
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_v2_q4_1 * restrict x0 = &x[i + 0];
|
||||
const block_v2_q4_1 * restrict x1 = &x[i + 1];
|
||||
const block_v2_q8_1 * restrict y0 = &y[i + 0];
|
||||
const block_v2_q8_1 * restrict y1 = &y[i + 1];
|
||||
|
||||
summs += x0->m * y0->s + x1->m * y1->s;
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||
|
||||
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
||||
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
||||
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
||||
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||
|
||||
// load y
|
||||
const int8x16_t v1_0l = vld1q_s8(y0->qs);
|
||||
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
|
||||
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
// dot product into int32x4_t
|
||||
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
||||
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
|
||||
#else
|
||||
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
|
||||
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
|
||||
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h));
|
||||
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h));
|
||||
|
||||
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l));
|
||||
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l));
|
||||
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h));
|
||||
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h));
|
||||
|
||||
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
||||
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
||||
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
||||
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
float summs = 0;
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const float * d0 = &x[i].d;
|
||||
const float * d1 = &y[i].d;
|
||||
|
||||
summs += x[i].m * y[i].s;
|
||||
|
||||
const __m256 d0v = _mm256_broadcast_ss( d0 );
|
||||
const __m256 d1v = _mm256_broadcast_ss( d1 );
|
||||
|
||||
// Compute combined scales
|
||||
const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
|
||||
|
||||
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
|
||||
const __m256i bx = bytes_from_nibbles_32(x[i].qs);
|
||||
const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
|
||||
|
||||
const __m256 xy = mul_sum_us8_pairs_float(bx, by);
|
||||
|
||||
// Accumulate d0*d1*x*y
|
||||
#if defined(__AVX2__)
|
||||
acc = _mm256_fmadd_ps( d0d1, xy, acc );
|
||||
#else
|
||||
acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc) + summs;
|
||||
#else
|
||||
// scalar
|
||||
float sumf = 0.0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
int sumi = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const int v0 = (x[i].qs[j] & 0x0F);
|
||||
const int v1 = (x[i].qs[j] >> 4);
|
||||
|
||||
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#endif
|
||||
}
|
||||
|
||||
void quantize_row_v2_q4_1_reference(const float * restrict x, void * restrict y_, int k) {
|
||||
block_v2_q4_1 * restrict y = y_;
|
||||
const int qk = V2_QK4_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float min = FLT_MAX;
|
||||
float max = -FLT_MAX;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i*qk + j];
|
||||
|
||||
if (v < min) min = v;
|
||||
if (v > max) max = v;
|
||||
}
|
||||
|
||||
const float d = (max - min) / ((1 << 4) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = d;
|
||||
y[i].m = min;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const float x0 = (x[i*qk + 0 + j] - min)*id;
|
||||
const float x1 = (x[i*qk + qk/2 + j] - min)*id;
|
||||
|
||||
const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
|
||||
const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
|
||||
|
||||
y[i].qs[j] = xi0;
|
||||
y[i].qs[j] |= xi1 << 4;
|
||||
}
|
||||
}
|
||||
}
|
22 third_party/ggml/ggjt.v2.q4_1.h vendored Normal file
|
@ -0,0 +1,22 @@
|
|||
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_1_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_1_H_
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
#define V2_QK4_1 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
float m; // min
|
||||
uint8_t qs[V2_QK4_1 / 2]; // nibbles / quants
|
||||
} block_v2_q4_1;
|
||||
|
||||
void dequantize_row_v2_q4_1(const void* restrict, float* restrict, int);
|
||||
size_t ggml_quantize_v2_q4_1(const float*, void*, int, int, int64_t*);
|
||||
void quantize_row_v2_q4_1(const float* restrict, void* restrict, int);
|
||||
void ggml_vec_dot_v2_q4_1_q8_1(const int, float* restrict, const void* restrict,
|
||||
const void* restrict);
|
||||
void quantize_row_v2_q4_1_reference(const float* restrict, void* restrict, int);
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_1_H_ */
|
393 third_party/ggml/ggjt.v2.q5_0.c vendored Normal file
|
@ -0,0 +1,393 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ GGML │
|
||||
│ Copyright (c) 2023 Georgi Gerganov │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "third_party/ggml/ggjt.v2.q5_0.h"
|
||||
#include "libc/assert.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/math.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "third_party/ggml/fp16.h"
|
||||
#include "third_party/ggml/fp16.internal.h"
|
||||
#include "third_party/ggml/ggjt.v2.internal.h"
|
||||
#include "third_party/ggml/ggjt.v2.q8_0.h"
|
||||
// clang-format off
|
||||
|
||||
static_assert(sizeof(block_v2_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + V2_QK5_0 / 2,
|
||||
"wrong q5_0 block size/padding");
|
||||
|
||||
void dequantize_row_v2_q5_0(const void * restrict x_, float * restrict y, int k) {
|
||||
const block_v2_q5_0 * restrict x = x_;
|
||||
static const int qk = V2_QK5_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[i].qh, sizeof(qh));
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
|
||||
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d;
|
||||
y[i*qk + j + qk/2] = x1*d;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_v2_q5_0(const float * restrict x, void * restrict y, int k) {
|
||||
quantize_row_v2_q5_0_reference(x, y, k);
|
||||
}
|
||||
|
||||
size_t ggml_quantize_v2_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
assert(k % V2_QK5_0 == 0);
|
||||
const int nb = k / V2_QK5_0;
|
||||
|
||||
for (int b = 0; b < n; b += k) {
|
||||
block_v2_q5_0 * restrict y = (block_v2_q5_0 *)dst + b/V2_QK5_0;
|
||||
|
||||
quantize_row_v2_q5_0_reference(src + b, y, k);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, &y[i].qh, sizeof(qh));
|
||||
|
||||
for (int j = 0; j < V2_QK5_0; j += 2) {
|
||||
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
||||
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
||||
|
||||
// cast to 16 bins
|
||||
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
||||
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
|
||||
|
||||
hist[vi0]++;
|
||||
hist[vi1]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (n/V2_QK5_0*sizeof(block_v2_q5_0));
|
||||
}
|
||||
|
||||
void ggml_vec_dot_v2_q5_0_q8_0(const int n,
|
||||
float * restrict s,
|
||||
const void * restrict vx,
|
||||
const void * restrict vy) {
|
||||
const int qk = V2_QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nb % 2 == 0);
|
||||
assert(qk == V2_QK5_0);
|
||||
|
||||
const block_v2_q5_0 * restrict x = vx;
|
||||
const block_v2_q8_0 * restrict y = vy;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
uint32_t qh0;
|
||||
uint32_t qh1;
|
||||
|
||||
uint64_t tmp0[4];
|
||||
uint64_t tmp1[4];
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_v2_q5_0 * restrict x0 = &x[i];
|
||||
const block_v2_q5_0 * restrict x1 = &x[i + 1];
|
||||
const block_v2_q8_0 * restrict y0 = &y[i];
|
||||
const block_v2_q8_0 * restrict y1 = &y[i + 1];
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||
|
||||
// extract the 5th bit via lookup table ((!b) << 4)
|
||||
memcpy(&qh0, x0->qh, sizeof(qh0));
|
||||
memcpy(&qh1, x1->qh, sizeof(qh1));
|
||||
|
||||
tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF];
|
||||
tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF];
|
||||
tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
|
||||
tmp0[3] = table_b2b_1[(qh0 >> 24) ];
|
||||
|
||||
tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF];
|
||||
tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF];
|
||||
tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
|
||||
tmp1[3] = table_b2b_1[(qh1 >> 24) ];
|
||||
|
||||
const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
|
||||
const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
|
||||
const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
|
||||
const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
|
||||
|
||||
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
||||
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
||||
int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||
int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
||||
int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||
|
||||
// add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
|
||||
const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0);
|
||||
const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0);
|
||||
const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1);
|
||||
const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1);
|
||||
|
||||
// load y
|
||||
const int8x16_t v1_0l = vld1q_s8(y0->qs);
|
||||
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
|
||||
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
||||
const float x1d = GGML_FP16_TO_FP32(x1->d);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
|
||||
#else
|
||||
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
||||
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
||||
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
|
||||
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
|
||||
|
||||
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
|
||||
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
|
||||
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
|
||||
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
|
||||
|
||||
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
||||
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
||||
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
||||
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
#elif defined(__wasm_simd128__)
|
||||
v128_t sumv = wasm_f32x4_splat(0.0f);
|
||||
|
||||
uint32_t qh;
|
||||
uint64_t tmp[4];
|
||||
|
||||
// TODO: check if unrolling this is better
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const block_v2_q5_0 * restrict x0 = &x[i];
|
||||
const block_v2_q8_0 * restrict y0 = &y[i];
|
||||
|
||||
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
||||
const v128_t s16b = wasm_i8x16_splat(0x10);
|
||||
|
||||
// extract the 5th bit
|
||||
memcpy(&qh, x0->qh, sizeof(qh));
|
||||
|
||||
tmp[0] = table_b2b_1[(qh >> 0) & 0xFF];
|
||||
tmp[1] = table_b2b_1[(qh >> 8) & 0xFF];
|
||||
tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
|
||||
tmp[3] = table_b2b_1[(qh >> 24) ];
|
||||
|
||||
const v128_t qhl = wasm_v128_load(tmp + 0);
|
||||
const v128_t qhh = wasm_v128_load(tmp + 2);
|
||||
|
||||
const v128_t v0 = wasm_v128_load(x0->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const v128_t v0l = wasm_v128_and (v0, m4b);
|
||||
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
||||
|
||||
// add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
|
||||
const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
|
||||
const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);
|
||||
|
||||
// load y
|
||||
const v128_t v1l = wasm_v128_load(y0->qs);
|
||||
const v128_t v1h = wasm_v128_load(y0->qs + 16);
|
||||
|
||||
// int8x16 -> int16x8
|
||||
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
|
||||
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
|
||||
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
|
||||
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
|
||||
|
||||
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
|
||||
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
|
||||
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
||||
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
||||
|
||||
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
||||
|
||||
// dot product
|
||||
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
||||
wasm_i32x4_add(
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
||||
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
||||
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
|
||||
}
|
||||
|
||||
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
|
||||
#elif defined(__AVX2__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; i++) {
|
||||
/* Compute combined scale for the block */
|
||||
const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
|
||||
|
||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
||||
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
||||
bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
|
||||
bx = _mm256_or_si256(bx, bxhi);
|
||||
|
||||
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
|
||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
||||
|
||||
/* Multiply q with scale and accumulate */
|
||||
acc = _mm256_fmadd_ps(d, q, acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
__m128i mask = _mm_set1_epi8((char)0xF0);
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; i++) {
|
||||
/* Compute combined scale for the block */
|
||||
const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
|
||||
|
||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
||||
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
||||
__m128i bxhil = _mm256_castsi256_si128(bxhi);
|
||||
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
|
||||
bxhil = _mm_andnot_si128(bxhil, mask);
|
||||
bxhih = _mm_andnot_si128(bxhih, mask);
|
||||
__m128i bxl = _mm256_castsi256_si128(bx);
|
||||
__m128i bxh = _mm256_extractf128_si256(bx, 1);
|
||||
bxl = _mm_or_si128(bxl, bxhil);
|
||||
bxh = _mm_or_si128(bxh, bxhih);
|
||||
bx = _mm256_set_m128i(bxh, bxl);
|
||||
|
||||
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
|
||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
||||
|
||||
/* Multiply q with scale and accumulate */
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#else
|
||||
// scalar
|
||||
float sumf = 0.0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[i].qh, sizeof(qh));
|
||||
|
||||
int sumi = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
||||
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
||||
|
||||
const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
|
||||
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
|
||||
|
||||
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#endif
|
||||
}
|
||||
|
||||
void quantize_row_v2_q5_0_reference(const float * restrict x, block_v2_q5_0 * restrict y, int k) {
|
||||
static const int qk = V2_QK5_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
float max = 0.0f;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i*qk + j];
|
||||
if (amax < fabsf(v)) {
|
||||
amax = fabsf(v);
|
||||
max = v;
|
||||
}
|
||||
}
|
||||
|
||||
const float d = max / -16;
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
|
||||
uint32_t qh = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const float x0 = x[i*qk + 0 + j]*id;
|
||||
const float x1 = x[i*qk + qk/2 + j]*id;
|
||||
|
||||
const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
|
||||
const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
|
||||
|
||||
y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
|
||||
|
||||
// get the 5-th bit and store it in qh at the right position
|
||||
qh |= ((xi0 & 0x10) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
|
||||
}
|
||||
|
||||
memcpy(&y[i].qh, &qh, sizeof(qh));
|
||||
}
|
||||
}
|
24
third_party/ggml/ggjt.v2.q5_0.h
vendored
Normal file
24
third_party/ggml/ggjt.v2.q5_0.h
vendored
Normal file
|
@ -0,0 +1,24 @@
|
|||
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_0_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_0_H_
|
||||
#include "third_party/ggml/fp16.h"
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
#define V2_QK5_0 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
uint8_t qh[4]; // 5-th bit of quants
|
||||
uint8_t qs[V2_QK5_0 / 2]; // nibbles / quants
|
||||
} block_v2_q5_0;
|
||||
|
||||
void dequantize_row_v2_q5_0(const void* restrict, float* restrict, int);
|
||||
void quantize_row_v2_q5_0(const float* restrict, void* restrict, int);
|
||||
size_t ggml_quantize_v2_q5_0(const float*, void*, int, int, int64_t*);
|
||||
void ggml_vec_dot_v2_q5_0_q8_0(const int, float* restrict, const void* restrict,
|
||||
const void* restrict);
|
||||
void quantize_row_v2_q5_0_reference(const float* restrict,
|
||||
block_v2_q5_0* restrict, int);
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_0_H_ */
|
409
third_party/ggml/ggjt.v2.q5_1.c
vendored
Normal file
409
third_party/ggml/ggjt.v2.q5_1.c
vendored
Normal file
|
@ -0,0 +1,409 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ GGML │
|
||||
│ Copyright (c) 2023 Georgi Gerganov │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "third_party/ggml/ggjt.v2.q5_1.h"
|
||||
#include "libc/assert.h"
|
||||
#include "libc/math.h"
|
||||
#include "libc/str/str.h"
|
||||
#include "third_party/ggml/fp16.internal.h"
|
||||
#include "third_party/ggml/ggjt.v2.internal.h"
|
||||
#include "third_party/ggml/ggjt.v2.q8_1.h"
|
||||
// clang-format off
|
||||
|
||||
static_assert(sizeof(block_v2_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + V2_QK5_1 / 2,
|
||||
"wrong q5_1 block size/padding");
|
||||
|
||||
void dequantize_row_v2_q5_1(const void * restrict x_, float * restrict y, int k) {
|
||||
const block_v2_q5_1 * restrict x = x_;
|
||||
static const int qk = V2_QK5_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d = GGML_FP16_TO_FP32(x[i].d);
|
||||
const float m = GGML_FP16_TO_FP32(x[i].m);
|
||||
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[i].qh, sizeof(qh));
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int x0 = (x[i].qs[j] & 0x0F) | xh_0;
|
||||
const int x1 = (x[i].qs[j] >> 4) | xh_1;
|
||||
|
||||
y[i*qk + j + 0 ] = x0*d + m;
|
||||
y[i*qk + j + qk/2] = x1*d + m;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_v2_q5_1(const float * restrict x, void * restrict y, int k) {
|
||||
quantize_row_v2_q5_1_reference(x, y, k);
|
||||
}
|
||||
|
||||
size_t ggml_quantize_v2_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
assert(k % V2_QK5_1 == 0);
|
||||
const int nb = k / V2_QK5_1;
|
||||
|
||||
for (int b = 0; b < n; b += k) {
|
||||
block_v2_q5_1 * restrict y = (block_v2_q5_1 *)dst + b/V2_QK5_1;
|
||||
|
||||
quantize_row_v2_q5_1_reference(src + b, y, k);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, &y[i].qh, sizeof(qh));
|
||||
|
||||
for (int j = 0; j < V2_QK5_1; j += 2) {
|
||||
const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
|
||||
const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
|
||||
|
||||
// cast to 16 bins
|
||||
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
||||
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
|
||||
|
||||
hist[vi0]++;
|
||||
hist[vi1]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (n/V2_QK5_1*sizeof(block_v2_q5_1));
|
||||
}
|
||||
|
||||
void ggml_vec_dot_v2_q5_1_q8_1(const int n,
|
||||
float * restrict s,
|
||||
const void * restrict vx,
|
||||
const void * restrict vy) {
|
||||
const int qk = V2_QK8_1;
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nb % 2 == 0);
|
||||
assert(qk == V2_QK5_1);
|
||||
|
||||
const block_v2_q5_1 * restrict x = vx;
|
||||
const block_v2_q8_1 * restrict y = vy;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
float summs0 = 0.0f;
|
||||
float summs1 = 0.0f;
|
||||
|
||||
uint32_t qh0;
|
||||
uint32_t qh1;
|
||||
|
||||
uint64_t tmp0[4];
|
||||
uint64_t tmp1[4];
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_v2_q5_1 * restrict x0 = &x[i];
|
||||
const block_v2_q5_1 * restrict x1 = &x[i + 1];
|
||||
const block_v2_q8_1 * restrict y0 = &y[i];
|
||||
const block_v2_q8_1 * restrict y1 = &y[i + 1];
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
||||
|
||||
summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
|
||||
summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
|
||||
|
||||
// extract the 5th bit via lookup table ((b) << 4)
|
||||
memcpy(&qh0, x0->qh, sizeof(qh0));
|
||||
memcpy(&qh1, x1->qh, sizeof(qh1));
|
||||
|
||||
tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF];
|
||||
tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF];
|
||||
tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
|
||||
tmp0[3] = table_b2b_0[(qh0 >> 24) ];
|
||||
|
||||
tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF];
|
||||
tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF];
|
||||
tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
|
||||
tmp1[3] = table_b2b_0[(qh1 >> 24) ];
|
||||
|
||||
const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
|
||||
const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
|
||||
const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
|
||||
const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
|
||||
|
||||
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
||||
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
|
||||
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
|
||||
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||
|
||||
// add high bit
|
||||
const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0);
|
||||
const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0);
|
||||
const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1);
|
||||
const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1);
|
||||
|
||||
// load y
|
||||
const int8x16_t v1_0l = vld1q_s8(y0->qs);
|
||||
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
|
||||
const int8x16_t v1_1l = vld1q_s8(y1->qs);
|
||||
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
|
||||
|
||||
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
||||
const float x1d = GGML_FP16_TO_FP32(x1->d);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
||||
vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
|
||||
#else
|
||||
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
|
||||
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
|
||||
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
|
||||
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
|
||||
|
||||
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
|
||||
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
|
||||
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
|
||||
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
|
||||
|
||||
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
|
||||
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
|
||||
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
|
||||
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
||||
#elif defined(__wasm_simd128__)
|
||||
v128_t sumv = wasm_f32x4_splat(0.0f);
|
||||
|
||||
float summs = 0.0f;
|
||||
|
||||
uint32_t qh;
|
||||
uint64_t tmp[4];
|
||||
|
||||
// TODO: check if unrolling this is better
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
const block_v2_q5_1 * restrict x0 = &x[i];
|
||||
const block_v2_q8_1 * restrict y0 = &y[i];
|
||||
|
||||
summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
|
||||
|
||||
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
||||
|
||||
// extract the 5th bit
|
||||
memcpy(&qh, x0->qh, sizeof(qh));
|
||||
|
||||
tmp[0] = table_b2b_0[(qh >> 0) & 0xFF];
|
||||
tmp[1] = table_b2b_0[(qh >> 8) & 0xFF];
|
||||
tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
|
||||
tmp[3] = table_b2b_0[(qh >> 24) ];
|
||||
|
||||
const v128_t qhl = wasm_v128_load(tmp + 0);
|
||||
const v128_t qhh = wasm_v128_load(tmp + 2);
|
||||
|
||||
const v128_t v0 = wasm_v128_load(x0->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const v128_t v0l = wasm_v128_and (v0, m4b);
|
||||
const v128_t v0h = wasm_u8x16_shr(v0, 4);
|
||||
|
||||
static bool x = true;
|
||||
|
||||
// add high bit
|
||||
const v128_t v0lf = wasm_v128_or(v0l, qhl);
|
||||
const v128_t v0hf = wasm_v128_or(v0h, qhh);
|
||||
|
||||
// load y
|
||||
const v128_t v1l = wasm_v128_load(y0->qs);
|
||||
const v128_t v1h = wasm_v128_load(y0->qs + 16);
|
||||
|
||||
// int8x16 -> int16x8
|
||||
const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
|
||||
const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
|
||||
const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
|
||||
const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
|
||||
|
||||
const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
|
||||
const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
|
||||
const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
|
||||
const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
|
||||
|
||||
const float x0d = GGML_FP16_TO_FP32(x0->d);
|
||||
|
||||
// dot product
|
||||
sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
|
||||
wasm_i32x4_add(
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
|
||||
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
||||
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
||||
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
|
||||
}
|
||||
|
||||
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
||||
wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
|
||||
#elif defined(__AVX2__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
float summs = 0.0f;
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
|
||||
|
||||
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
||||
|
||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
||||
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
||||
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
||||
bx = _mm256_or_si256(bx, bxhi);
|
||||
|
||||
const __m256 dy = _mm256_broadcast_ss(&y[i].d);
|
||||
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
|
||||
const __m256 q = mul_sum_us8_pairs_float(bx, by);
|
||||
|
||||
acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc) + summs;
|
||||
#elif defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
__m128i mask = _mm_set1_epi8(0x10);
|
||||
|
||||
float summs = 0.0f;
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
|
||||
|
||||
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
||||
|
||||
__m256i bx = bytes_from_nibbles_32(x[i].qs);
|
||||
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
||||
__m128i bxhil = _mm256_castsi256_si128(bxhi);
|
||||
__m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
|
||||
bxhil = _mm_and_si128(bxhil, mask);
|
||||
bxhih = _mm_and_si128(bxhih, mask);
|
||||
__m128i bxl = _mm256_castsi256_si128(bx);
|
||||
__m128i bxh = _mm256_extractf128_si256(bx, 1);
|
||||
bxl = _mm_or_si128(bxl, bxhil);
|
||||
bxh = _mm_or_si128(bxh, bxhih);
|
||||
bx = _mm256_set_m128i(bxh, bxl);
|
||||
|
||||
const __m256 dy = _mm256_broadcast_ss(&y[i].d);
|
||||
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
|
||||
const __m256 q = mul_sum_us8_pairs_float(bx, by);
|
||||
|
||||
acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc) + summs;
|
||||
#else
|
||||
// scalar
|
||||
float sumf = 0.0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[i].qh, sizeof(qh));
|
||||
|
||||
int sumi = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
|
||||
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
|
||||
|
||||
const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0;
|
||||
const int32_t x1 = (x[i].qs[j] >> 4) | xh_1;
|
||||
|
||||
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#endif
|
||||
}
|
||||
|
||||
void quantize_row_v2_q5_1_reference(const float * restrict x, block_v2_q5_1 * restrict y, int k) {
|
||||
const int qk = V2_QK5_1;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float min = FLT_MAX;
|
||||
float max = -FLT_MAX;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
const float v = x[i*qk + j];
|
||||
|
||||
if (v < min) min = v;
|
||||
if (v > max) max = v;
|
||||
}
|
||||
|
||||
const float d = (max - min) / ((1 << 5) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = GGML_FP32_TO_FP16(d);
|
||||
y[i].m = GGML_FP32_TO_FP16(min);
|
||||
|
||||
uint32_t qh = 0;
|
||||
|
||||
for (int j = 0; j < qk/2; ++j) {
|
||||
const float x0 = (x[i*qk + 0 + j] - min)*id;
|
||||
const float x1 = (x[i*qk + qk/2 + j] - min)*id;
|
||||
|
||||
const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
|
||||
const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
|
||||
|
||||
y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
|
||||
|
||||
// get the 5-th bit and store it in qh at the right position
|
||||
qh |= ((xi0 & 0x10) >> 4) << (j + 0);
|
||||
qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
|
||||
}
|
||||
|
||||
memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
|
||||
}
|
||||
}
|
25
third_party/ggml/ggjt.v2.q5_1.h
vendored
Normal file
25
third_party/ggml/ggjt.v2.q5_1.h
vendored
Normal file
|
@ -0,0 +1,25 @@
|
|||
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_1_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_1_H_
|
||||
#include "third_party/ggml/fp16.h"
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
#define V2_QK5_1 32
|
||||
typedef struct {
|
||||
ggml_fp16_t d; // delta
|
||||
ggml_fp16_t m; // min
|
||||
uint8_t qh[4]; // 5-th bit of quants
|
||||
uint8_t qs[V2_QK5_1 / 2]; // nibbles / quants
|
||||
} block_v2_q5_1;
|
||||
|
||||
void dequantize_row_v2_q5_1(const void* restrict, float* restrict, int);
|
||||
void quantize_row_v2_q5_1(const float* restrict, void* restrict, int);
|
||||
size_t ggml_quantize_v2_q5_1(const float*, void*, int, int, int64_t*);
|
||||
void ggml_vec_dot_v2_q5_1_q8_1(const int, float* restrict, const void* restrict,
|
||||
const void* restrict);
|
||||
void quantize_row_v2_q5_1_reference(const float* restrict,
|
||||
block_v2_q5_1* restrict, int);
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_1_H_ */
|
332
third_party/ggml/ggjt.v2.q8_0.c
vendored
Normal file
332
third_party/ggml/ggjt.v2.q8_0.c
vendored
Normal file
|
@ -0,0 +1,332 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ GGML │
|
||||
│ Copyright (c) 2023 Georgi Gerganov │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "third_party/ggml/ggjt.v2.q8_0.h"
|
||||
#include "libc/assert.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/math.h"
|
||||
#include "third_party/ggml/ggjt.v2.internal.h"
|
||||
// clang-format off
|
||||
|
||||
static_assert(sizeof(block_v2_q8_0) == sizeof(float) + V2_QK8_0,
|
||||
"wrong q8_0 block size/padding");
|
||||
|
||||
void dequantize_row_v2_q8_0(const void * restrict vx, float * restrict y, int k) {
|
||||
static const int qk = V2_QK8_0;
|
||||
|
||||
assert(k % qk == 0);
|
||||
|
||||
const int nb = k / qk;
|
||||
|
||||
const block_v2_q8_0 * restrict x = vx;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d = x[i].d;
|
||||
|
||||
for (int j = 0; j < qk; ++j) {
|
||||
y[i*qk + j] = x[i].qs[j]*d;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_row_v2_q8_0(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(V2_QK8_0 == 32);
|
||||
assert(k % V2_QK8_0 == 0);
|
||||
const int nb = k / V2_QK8_0;
|
||||
|
||||
block_v2_q8_0 * restrict y = vy;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float32x4_t srcv [8];
|
||||
float32x4_t asrcv[8];
|
||||
float32x4_t amaxv[8];
|
||||
|
||||
for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j);
|
||||
for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
|
||||
|
||||
for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
|
||||
for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
|
||||
for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
|
||||
|
||||
const float amax = vmaxvq_f32(amaxv[0]);
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = d;
|
||||
|
||||
for (int j = 0; j < 8; j++) {
|
||||
const float32x4_t v = vmulq_n_f32(srcv[j], id);
|
||||
const int32x4_t vi = vcvtnq_s32_f32(v);
|
||||
|
||||
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
|
||||
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
|
||||
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
|
||||
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
|
||||
}
|
||||
}
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Load elements into 4 AVX vectors
|
||||
__m256 v0 = _mm256_loadu_ps( x );
|
||||
__m256 v1 = _mm256_loadu_ps( x + 8 );
|
||||
__m256 v2 = _mm256_loadu_ps( x + 16 );
|
||||
__m256 v3 = _mm256_loadu_ps( x + 24 );
|
||||
x += 32;
|
||||
|
||||
// Compute max(abs(e)) for the block
|
||||
const __m256 signBit = _mm256_set1_ps( -0.0f );
|
||||
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
|
||||
|
||||
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
|
||||
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
|
||||
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
|
||||
const float maxScalar = _mm_cvtss_f32( max4 );
|
||||
|
||||
// Quantize these floats
|
||||
const float d = maxScalar / 127.f;
|
||||
y[i].d = d;
|
||||
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
||||
const __m256 mul = _mm256_set1_ps( id );
|
||||
|
||||
// Apply the multiplier
|
||||
v0 = _mm256_mul_ps( v0, mul );
|
||||
v1 = _mm256_mul_ps( v1, mul );
|
||||
v2 = _mm256_mul_ps( v2, mul );
|
||||
v3 = _mm256_mul_ps( v3, mul );
|
||||
|
||||
// Round to nearest integer
|
||||
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
|
||||
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
|
||||
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
|
||||
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
|
||||
|
||||
// Convert floats to integers
|
||||
__m256i i0 = _mm256_cvtps_epi32( v0 );
|
||||
__m256i i1 = _mm256_cvtps_epi32( v1 );
|
||||
__m256i i2 = _mm256_cvtps_epi32( v2 );
|
||||
__m256i i3 = _mm256_cvtps_epi32( v3 );
|
||||
|
||||
#if defined(__AVX2__)
|
||||
// Convert int32 to int16
|
||||
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
|
||||
i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
|
||||
// Convert int16 to int8
|
||||
i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
|
||||
|
||||
// We got our precious signed bytes, but the order is now wrong
|
||||
// These AVX2 pack instructions process 16-byte pieces independently
|
||||
// The following instruction is fixing the order
|
||||
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
|
||||
i0 = _mm256_permutevar8x32_epi32( i0, perm );
|
||||
|
||||
_mm256_storeu_si256((__m256i *)y[i].qs, i0);
|
||||
#else
|
||||
// Since we don't have in AVX some necessary functions,
|
||||
// we split the registers in half and call AVX2 analogs from SSE
|
||||
__m128i ni0 = _mm256_castsi256_si128( i0 );
|
||||
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
|
||||
__m128i ni2 = _mm256_castsi256_si128( i1 );
|
||||
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
|
||||
__m128i ni4 = _mm256_castsi256_si128( i2 );
|
||||
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
|
||||
__m128i ni6 = _mm256_castsi256_si128( i3 );
|
||||
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
|
||||
|
||||
// Convert int32 to int16
|
||||
ni0 = _mm_packs_epi32( ni0, ni1 );
|
||||
ni2 = _mm_packs_epi32( ni2, ni3 );
|
||||
ni4 = _mm_packs_epi32( ni4, ni5 );
|
||||
ni6 = _mm_packs_epi32( ni6, ni7 );
|
||||
// Convert int16 to int8
|
||||
ni0 = _mm_packs_epi16( ni0, ni2 );
|
||||
ni4 = _mm_packs_epi16( ni4, ni6 );
|
||||
|
||||
_mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
|
||||
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
quantize_row_q8_0_reference(x, y, k);
|
||||
#endif
|
||||
}
|
||||
|
||||
size_t ggml_quantize_v2_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
assert(k % V2_QK8_0 == 0);
|
||||
const int nb = k / V2_QK8_0;
|
||||
|
||||
for (int b = 0; b < n; b += k) {
|
||||
block_v2_q8_0 * restrict y = (block_v2_q8_0 *)dst + b/V2_QK8_0;
|
||||
|
||||
quantize_row_v2_q8_0_reference(src + b, y, k);
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
for (int j = 0; j < V2_QK8_0; ++j) {
|
||||
const int8_t vi = y[i].qs[j];
|
||||
|
||||
hist[vi/16 + 8]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (n/V2_QK8_0*sizeof(block_v2_q8_0));
|
||||
}
|
||||
|
||||
void ggml_vec_dot_v2_q8_0_q8_0(const int n,
|
||||
float * restrict s,
|
||||
const void * restrict vx,
|
||||
const void * restrict vy) {
|
||||
const int qk = V2_QK8_0;
|
||||
const int nb = n / qk;
|
||||
|
||||
assert(n % qk == 0);
|
||||
assert(nb % 2 == 0);
|
||||
|
||||
const block_v2_q8_0 * restrict x = vx;
|
||||
const block_v2_q8_0 * restrict y = vy;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_v2_q8_0 * restrict x0 = &x[i + 0];
|
||||
const block_v2_q8_0 * restrict x1 = &x[i + 1];
|
||||
const block_v2_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_v2_q8_0 * restrict y1 = &y[i + 1];
|
||||
|
||||
const int8x16_t x0_0 = vld1q_s8(x0->qs);
|
||||
const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
|
||||
const int8x16_t x1_0 = vld1q_s8(x1->qs);
|
||||
const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
|
||||
|
||||
// load y
|
||||
const int8x16_t y0_0 = vld1q_s8(y0->qs);
|
||||
const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
|
||||
const int8x16_t y1_0 = vld1q_s8(y1->qs);
|
||||
const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
|
||||
vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
|
||||
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
||||
vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
|
||||
vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
|
||||
|
||||
#else
|
||||
const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
|
||||
const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
|
||||
const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
|
||||
const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
|
||||
|
||||
const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
|
||||
const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
|
||||
const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
|
||||
const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
|
||||
|
||||
const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
|
||||
const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
|
||||
const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
|
||||
const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
|
||||
|
||||
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
|
||||
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
// Main loop
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
// Compute combined scale for the block
|
||||
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
|
||||
__m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
|
||||
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
||||
|
||||
const __m256 q = mul_sum_i8_pairs_float(bx, by);
|
||||
|
||||
// Multiply q with scale and accumulate
|
||||
#if defined(__AVX2__)
|
||||
acc = _mm256_fmadd_ps( d, q, acc );
|
||||
#else
|
||||
acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
|
||||
#endif
|
||||
}
|
||||
|
||||
*s = hsum_float_8(acc);
|
||||
#else
|
||||
// scalar
|
||||
float sumf = 0.0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
int sumi = 0;
|
||||
|
||||
for (int j = 0; j < qk; j++) {
|
||||
sumi += x[i].qs[j]*y[i].qs[j];
|
||||
}
|
||||
|
||||
sumf += (x[i].d*y[i].d)*sumi;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
#endif
|
||||
}
|
||||
|
||||
void quantize_row_v2_q8_0_reference(const float * restrict x, block_v2_q8_0 * restrict y, int k) {
|
||||
assert(k % V2_QK8_0 == 0);
|
||||
const int nb = k / V2_QK8_0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < V2_QK8_0; j++) {
|
||||
const float v = x[i*V2_QK8_0 + j];
|
||||
amax = MAX(amax, fabsf(v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = d;
|
||||
|
||||
for (int j = 0; j < V2_QK8_0; ++j) {
|
||||
const float x0 = x[i*V2_QK8_0 + j]*id;
|
||||
|
||||
y[i].qs[j] = roundf(x0);
|
||||
}
|
||||
}
|
||||
}
|
22
third_party/ggml/ggjt.v2.q8_0.h
vendored
Normal file
22
third_party/ggml/ggjt.v2.q8_0.h
vendored
Normal file
|
@ -0,0 +1,22 @@
|
|||
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_0_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_0_H_
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
#define V2_QK8_0 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
int8_t qs[V2_QK8_0]; // quants
|
||||
} block_v2_q8_0;
|
||||
|
||||
void dequantize_row_v2_q8_0(const void* restrict, float* restrict, int);
|
||||
void quantize_row_v2_q8_0(const float* restrict, void* restrict, int);
|
||||
size_t ggml_quantize_v2_q8_0(const float*, void*, int, int, int64_t*);
|
||||
void ggml_vec_dot_v2_q8_0_q8_0(const int, float* restrict, const void* restrict,
|
||||
const void* restrict);
|
||||
void quantize_row_v2_q8_0_reference(const float* restrict,
|
||||
block_v2_q8_0* restrict, int);
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_0_H_ */
|
221
third_party/ggml/ggjt.v2.q8_1.c
vendored
Normal file
221
third_party/ggml/ggjt.v2.q8_1.c
vendored
Normal file
|
@ -0,0 +1,221 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ GGML │
|
||||
│ Copyright (c) 2023 Georgi Gerganov │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
#include "third_party/ggml/ggjt.v2.q8_1.h"
|
||||
#include "libc/assert.h"
|
||||
#include "libc/macros.internal.h"
|
||||
#include "libc/math.h"
|
||||
#include "third_party/ggml/ggjt.v2.internal.h"
|
||||
// clang-format off
|
||||
|
||||
static_assert(sizeof(block_v2_q8_1) == 2*sizeof(float) + V2_QK8_1,
|
||||
"wrong q8_1 block size/padding");
|
||||
|
||||
#if __AVX__ || __AVX2__ || __AVX512F__
|
||||
static inline int hsum_i32_8(const __m256i a) {
|
||||
const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
|
||||
const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
|
||||
const __m128i sum64 = _mm_add_epi32(hi64, sum128);
|
||||
const __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
|
||||
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
|
||||
}
|
||||
#endif
|
||||
|
||||
void quantize_row_v2_q8_1(const float * restrict x, void * restrict vy, int k) {
|
||||
assert(k % V2_QK8_1 == 0);
|
||||
const int nb = k / V2_QK8_1;
|
||||
|
||||
block_v2_q8_1 * restrict y = vy;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float32x4_t srcv [8];
|
||||
float32x4_t asrcv[8];
|
||||
float32x4_t amaxv[8];
|
||||
|
||||
for (int j = 0; j < 8; j++) srcv[j] = vld1q_f32(x + i*32 + 4*j);
|
||||
for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
|
||||
|
||||
for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
|
||||
for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
|
||||
for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
|
||||
|
||||
const float amax = vmaxvq_f32(amaxv[0]);
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = d;
|
||||
|
||||
int32x4_t accv = vdupq_n_s32(0);
|
||||
|
||||
for (int j = 0; j < 8; j++) {
|
||||
const float32x4_t v = vmulq_n_f32(srcv[j], id);
|
||||
const int32x4_t vi = vcvtnq_s32_f32(v);
|
||||
|
||||
y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
|
||||
y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
|
||||
y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
|
||||
y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
|
||||
|
||||
accv = vaddq_s32(accv, vi);
|
||||
}
|
||||
|
||||
y[i].s = d * vaddvq_s32(accv);
|
||||
}
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
for (int i = 0; i < nb; i++) {
|
||||
// Load elements into 4 AVX vectors
|
||||
__m256 v0 = _mm256_loadu_ps( x );
|
||||
__m256 v1 = _mm256_loadu_ps( x + 8 );
|
||||
__m256 v2 = _mm256_loadu_ps( x + 16 );
|
||||
__m256 v3 = _mm256_loadu_ps( x + 24 );
|
||||
x += 32;
|
||||
|
||||
// Compute max(abs(e)) for the block
|
||||
const __m256 signBit = _mm256_set1_ps( -0.0f );
|
||||
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
|
||||
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
|
||||
|
||||
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
|
||||
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
|
||||
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
|
||||
const float maxScalar = _mm_cvtss_f32( max4 );
|
||||
|
||||
// Quantize these floats
|
||||
const float d = maxScalar / 127.f;
|
||||
y[i].d = d;
|
||||
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
||||
const __m256 mul = _mm256_set1_ps( id );
|
||||
|
||||
// Apply the multiplier
|
||||
v0 = _mm256_mul_ps( v0, mul );
|
||||
v1 = _mm256_mul_ps( v1, mul );
|
||||
v2 = _mm256_mul_ps( v2, mul );
|
||||
v3 = _mm256_mul_ps( v3, mul );
|
||||
|
||||
// Round to nearest integer
|
||||
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
|
||||
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
|
||||
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
|
||||
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
|
||||
|
||||
// Convert floats to integers
|
||||
__m256i i0 = _mm256_cvtps_epi32( v0 );
|
||||
__m256i i1 = _mm256_cvtps_epi32( v1 );
|
||||
__m256i i2 = _mm256_cvtps_epi32( v2 );
|
||||
__m256i i3 = _mm256_cvtps_epi32( v3 );
|
||||
|
||||
#if defined(__AVX2__)
|
||||
// Compute the sum of the quants and set y[i].s
|
||||
y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
|
||||
|
||||
// Convert int32 to int16
|
||||
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
|
||||
i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
|
||||
// Convert int16 to int8
|
||||
i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
|
||||
|
||||
// We got our precious signed bytes, but the order is now wrong
|
||||
// These AVX2 pack instructions process 16-byte pieces independently
|
||||
// The following instruction is fixing the order
|
||||
const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
|
||||
i0 = _mm256_permutevar8x32_epi32( i0, perm );
|
||||
|
||||
_mm256_storeu_si256((__m256i *)y[i].qs, i0);
|
||||
#else
|
||||
// Since we don't have in AVX some necessary functions,
|
||||
// we split the registers in half and call AVX2 analogs from SSE
|
||||
__m128i ni0 = _mm256_castsi256_si128( i0 );
|
||||
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
|
||||
__m128i ni2 = _mm256_castsi256_si128( i1 );
|
||||
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
|
||||
__m128i ni4 = _mm256_castsi256_si128( i2 );
|
||||
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
|
||||
__m128i ni6 = _mm256_castsi256_si128( i3 );
|
||||
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
|
||||
|
||||
// Compute the sum of the quants and set y[i].s
|
||||
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
|
||||
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
|
||||
y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1));
|
||||
|
||||
// Convert int32 to int16
|
||||
ni0 = _mm_packs_epi32( ni0, ni1 );
|
||||
ni2 = _mm_packs_epi32( ni2, ni3 );
|
||||
ni4 = _mm_packs_epi32( ni4, ni5 );
|
||||
ni6 = _mm_packs_epi32( ni6, ni7 );
|
||||
// Convert int16 to int8
|
||||
ni0 = _mm_packs_epi16( ni0, ni2 );
|
||||
ni4 = _mm_packs_epi16( ni4, ni6 );
|
||||
|
||||
_mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
|
||||
_mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
// scalar
|
||||
quantize_row_q8_1_reference(x, y, k);
|
||||
#endif
|
||||
}
|
||||
|
||||
void quantize_row_v2_q8_1_reference(const float * restrict x, block_v2_q8_1 * restrict y, int k) {
|
||||
assert(V2_QK8_1 == 32);
|
||||
assert(k % V2_QK8_1 == 0);
|
||||
const int nb = k / V2_QK8_1;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int j = 0; j < V2_QK8_1; j++) {
|
||||
const float v = x[i*V2_QK8_1 + j];
|
||||
amax = MAX(amax, fabsf(v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = d;
|
||||
|
||||
int sum = 0;
|
||||
|
||||
for (int j = 0; j < V2_QK8_1/2; ++j) {
|
||||
const float v0 = x[i*V2_QK8_1 + j]*id;
|
||||
const float v1 = x[i*V2_QK8_1 + V2_QK8_1/2 + j]*id;
|
||||
|
||||
y[i].qs[ j] = roundf(v0);
|
||||
y[i].qs[V2_QK8_1/2 + j] = roundf(v1);
|
||||
|
||||
sum += y[i].qs[ j];
|
||||
sum += y[i].qs[V2_QK8_1/2 + j];
|
||||
}
|
||||
|
||||
y[i].s = d * sum;
|
||||
}
|
||||
}
|
19
third_party/ggml/ggjt.v2.q8_1.h
vendored
Normal file
19
third_party/ggml/ggjt.v2.q8_1.h
vendored
Normal file
|
@ -0,0 +1,19 @@
|
|||
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_1_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_1_H_
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
|
||||
#define V2_QK8_1 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
float s; // d * sum(qs[i])
|
||||
int8_t qs[V2_QK8_1]; // quants
|
||||
} block_v2_q8_1;
|
||||
|
||||
void quantize_row_v2_q8_1(const float* restrict, void* restrict, int);
|
||||
void quantize_row_v2_q8_1_reference(const float* restrict,
|
||||
block_v2_q8_1* restrict, int);
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_1_H_ */
|
1200
third_party/ggml/ggml.c
vendored
1200
third_party/ggml/ggml.c
vendored
File diff suppressed because it is too large
Load diff
49
third_party/ggml/ggml.h
vendored
49
third_party/ggml/ggml.h
vendored
|
@ -191,11 +191,15 @@ COSMOPOLITAN_C_START_
|
|||
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
||||
#define GGML_FILE_VERSION 1
|
||||
|
||||
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
|
||||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
||||
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_NODES 4096
|
||||
#define GGML_MAX_PARAMS 16
|
||||
#define GGML_MAX_PARAMS 256
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_OPT 4
|
||||
#define GGML_MAX_NAME 32
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
#define GGML_ASSERT(x) \
|
||||
|
@ -229,6 +233,7 @@ COSMOPOLITAN_C_START_
|
|||
enum ggml_backend {
|
||||
GGML_BACKEND_CPU = 0,
|
||||
GGML_BACKEND_CUDA = 1,
|
||||
GGML_BACKEND_CL = 2,
|
||||
};
|
||||
|
||||
// model file types
|
||||
|
@ -294,6 +299,7 @@ COSMOPOLITAN_C_START_
|
|||
GGML_OP_ROPE,
|
||||
GGML_OP_ROPE_BACK,
|
||||
GGML_OP_ALIBI,
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D_1S,
|
||||
GGML_OP_CONV_1D_2S,
|
||||
|
||||
|
@ -351,11 +357,13 @@ COSMOPOLITAN_C_START_
|
|||
|
||||
void * data;
|
||||
|
||||
char name[32];
|
||||
char name[GGML_MAX_NAME];
|
||||
|
||||
char padding[16]; // TODO: remove and add padding to name?
|
||||
char padding[16];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
int n_nodes;
|
||||
|
@ -393,6 +401,8 @@ COSMOPOLITAN_C_START_
|
|||
// compatibilty
|
||||
//
|
||||
|
||||
// TODO(jart): These should be associated with each tensor.
|
||||
GGML_API void ggjt_v3(void);
|
||||
GGML_API void ggjt_v2(void);
|
||||
GGML_API void ggjt_v1(void);
|
||||
|
||||
|
@ -415,6 +425,7 @@ COSMOPOLITAN_C_START_
|
|||
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
||||
|
||||
GGML_API const char * ggml_type_name(enum ggml_type type);
|
||||
GGML_API const char * ggml_op_name (enum ggml_op op);
|
||||
|
||||
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
|
||||
|
@ -423,6 +434,9 @@ COSMOPOLITAN_C_START_
|
|||
// TODO: temporary until model loading of ggml examples is refactored
|
||||
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
||||
|
||||
// use this to compute the memory overhead of a tensor
|
||||
GGML_API size_t ggml_tensor_overhead(void);
|
||||
|
||||
// main
|
||||
|
||||
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
|
@ -430,7 +444,11 @@ COSMOPOLITAN_C_START_
|
|||
|
||||
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
||||
|
||||
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
|
||||
|
||||
GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
|
||||
GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor(
|
||||
struct ggml_context * ctx,
|
||||
|
@ -470,6 +488,8 @@ COSMOPOLITAN_C_START_
|
|||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
@ -730,8 +750,6 @@ COSMOPOLITAN_C_START_
|
|||
struct ggml_tensor * a,
|
||||
int64_t ne0);
|
||||
|
||||
// return view(a)
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
GGML_API struct ggml_tensor * ggml_reshape_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
@ -839,7 +857,7 @@ COSMOPOLITAN_C_START_
|
|||
int n_past);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * gml_diag_mask_zero_inplace(
|
||||
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past);
|
||||
|
@ -854,7 +872,6 @@ COSMOPOLITAN_C_START_
|
|||
struct ggml_tensor * a);
|
||||
|
||||
// rotary position embedding
|
||||
// in-place, returns view(a)
|
||||
// if mode & 1 == 1, skip n_past elements
|
||||
// if mode & 2 == 1, GPT-NeoX style
|
||||
// TODO: avoid creating a new tensor every time
|
||||
|
@ -888,7 +905,16 @@ COSMOPOLITAN_C_START_
|
|||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_head);
|
||||
int n_head,
|
||||
float bias_max);
|
||||
|
||||
// clamp
|
||||
// in-place, returns view(a)
|
||||
struct ggml_tensor * ggml_clamp(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float min,
|
||||
float max);
|
||||
|
||||
// padding = 1
|
||||
// TODO: we don't support extra parameters for now
|
||||
|
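ggml_alibi() now takes bias_max as a parameter instead of baking it in. As a hedged illustration only (this is the conventional ALiBi slope schedule from the paper, not necessarily ggml's exact arithmetic), the per-head slope is usually derived from bias_max like this:

// Illustration: conventional ALiBi slopes; head h gets 2^(-bias_max*(h+1)/n_head),
// so later heads attenuate distant positions more aggressively.
#include <cmath>
#include <cstdio>

int main() {
  const int n_head = 8;
  const float bias_max = 8.0f;   // the value the original paper uses
  for (int h = 0; h < n_head; ++h) {
    float slope = std::pow(2.0f, -bias_max * (h + 1) / n_head);
    std::printf("head %d: slope %g\n", h, slope);
  }
}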
@ -950,6 +976,11 @@ COSMOPOLITAN_C_START_
|
|||
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
||||
|
||||
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
||||
GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
||||
|
||||
// print info and performance information for the graph
|
||||
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
||||
|
||||
|
|
19
third_party/ggml/ggml.mk
vendored
|
@ -24,8 +24,15 @@ THIRD_PARTY_GGML_A_HDRS = \
|
|||
third_party/ggml/ggjt.v1.q5_1.h \
|
||||
third_party/ggml/ggjt.v1.q8_0.h \
|
||||
third_party/ggml/ggjt.v1.q8_1.h \
|
||||
third_party/ggml/ggjt.v2.q4_0.h \
|
||||
third_party/ggml/ggjt.v2.q4_1.h \
|
||||
third_party/ggml/ggjt.v2.q5_0.h \
|
||||
third_party/ggml/ggjt.v2.q5_1.h \
|
||||
third_party/ggml/ggjt.v2.q8_0.h \
|
||||
third_party/ggml/ggjt.v2.q8_1.h \
|
||||
third_party/ggml/fp16.internal.h \
|
||||
third_party/ggml/ggjt.v1.internal.h
|
||||
third_party/ggml/ggjt.v1.internal.h \
|
||||
third_party/ggml/ggjt.v2.internal.h
|
||||
|
||||
THIRD_PARTY_GGML_A_SRCS = \
|
||||
third_party/ggml/fp16.c \
|
||||
|
@ -37,11 +44,19 @@ THIRD_PARTY_GGML_A_SRCS = \
|
|||
third_party/ggml/ggjt.v1.q5_0.c \
|
||||
third_party/ggml/ggjt.v1.q5_1.c \
|
||||
third_party/ggml/ggjt.v1.q8_0.c \
|
||||
third_party/ggml/ggjt.v1.q8_1.c
|
||||
third_party/ggml/ggjt.v1.q8_1.c \
|
||||
third_party/ggml/ggjt.v2.c \
|
||||
third_party/ggml/ggjt.v2.q4_0.c \
|
||||
third_party/ggml/ggjt.v2.q4_1.c \
|
||||
third_party/ggml/ggjt.v2.q5_0.c \
|
||||
third_party/ggml/ggjt.v2.q5_1.c \
|
||||
third_party/ggml/ggjt.v2.q8_0.c \
|
||||
third_party/ggml/ggjt.v2.q8_1.c
|
||||
|
||||
THIRD_PARTY_GGML_A_DIRECTDEPS = \
|
||||
LIBC_CALLS \
|
||||
LIBC_INTRIN \
|
||||
LIBC_FMT \
|
||||
LIBC_MEM \
|
||||
LIBC_NEXGEN32E \
|
||||
LIBC_RUNTIME \
|
||||
|
|
276
third_party/ggml/llama.cc
vendored
|
@ -68,6 +68,7 @@ asm(".include \"libc/disclaimer.inc\"");
|
|||
// available llama models
|
||||
enum e_model {
|
||||
MODEL_UNKNOWN,
|
||||
MODEL_3B,
|
||||
MODEL_7B,
|
||||
MODEL_13B,
|
||||
MODEL_30B,
|
||||
|
@ -83,6 +84,7 @@ static const size_t MB = 1024*1024;
|
|||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
|
||||
{ MODEL_3B, 128ull * MB },
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
|
@ -94,6 +96,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
|||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
|
||||
{ MODEL_3B, 128ull * MB },
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
|
@ -106,6 +109,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
|||
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_3B, 682ull * MB },
|
||||
{ MODEL_7B, 1026ull * MB },
|
||||
{ MODEL_13B, 1608ull * MB },
|
||||
{ MODEL_30B, 3124ull * MB },
|
||||
|
@ -119,6 +123,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
|||
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_3B, 512ull * MB },
|
||||
{ MODEL_7B, 768ull * MB },
|
||||
{ MODEL_13B, 1024ull * MB },
|
||||
{ MODEL_30B, 1280ull * MB },
|
||||
|
@ -436,6 +441,7 @@ enum llama_file_version {
|
|||
LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
|
||||
LLAMA_FILE_VERSION_GGJT_V1, // adopted unified aligned mappable layout
|
||||
LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
|
||||
LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
|
||||
};
|
||||
|
||||
struct llama_file_loader {
|
||||
|
@ -457,22 +463,25 @@ struct llama_file_loader {
|
|||
uint32_t magic = file.read_u32();
|
||||
uint32_t version = 0;
|
||||
|
||||
if (magic != READ32BE("ggml")) {
|
||||
if (magic != LLAMA_FILE_MAGIC_GGML) {
|
||||
version = file.read_u32();
|
||||
}
|
||||
|
||||
if (magic == READ32BE("ggml") && version == 0) {
|
||||
if (magic == LLAMA_FILE_MAGIC_GGML && version == 0) {
|
||||
file_version = LLAMA_FILE_VERSION_GGML;
|
||||
ggjt_v1();
|
||||
} else if (magic == READ32BE("ggmf") && version == 1) {
|
||||
} else if (magic == LLAMA_FILE_MAGIC_GGMF && version == 1) {
|
||||
file_version = LLAMA_FILE_VERSION_GGMF_V1;
|
||||
ggjt_v1();
|
||||
} else if (magic == READ32BE("ggjt") && version == 1) {
|
||||
} else if (magic == LLAMA_FILE_MAGIC_GGJT && version == 1) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V1;
|
||||
ggjt_v1();
|
||||
} else if (magic == READ32BE("ggjt") && version == 2) {
|
||||
} else if (magic == LLAMA_FILE_MAGIC_GGJT && version == 2) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V2;
|
||||
ggjt_v2();
|
||||
} else if (magic == LLAMA_FILE_MAGIC_GGJT && version == 3) {
|
||||
file_version = LLAMA_FILE_VERSION_GGJT_V3;
|
||||
ggjt_v3();
|
||||
} else {
|
||||
Die("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
|
||||
magic, version);
|
||||
|
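The hunk above is where GGJT v3 support lands in the loader: each (magic, version) pair selects a file_version and calls the matching ggjt_v*() codec shim. A condensed sketch of just the decision table, using the raw magic values spelled out in llama.h (the real loader only reads a version word when the magic is not the bare 'ggml' one):

// Sketch: classify a weights file from its (magic, version) header words.
#include <cstdint>
#include <cstdio>

static const char * classify(uint32_t magic, uint32_t version) {
  const uint32_t GGML = 0x67676d6cu, GGMF = 0x67676d66u, GGJT = 0x67676a74u;
  if (magic == GGML && version == 0) return "ggml (unversioned, no mmap)";
  if (magic == GGMF && version == 1) return "ggmf v1";
  if (magic == GGJT && version == 1) return "ggjt v1";
  if (magic == GGJT && version == 2) return "ggjt v2";
  if (magic == GGJT && version == 3) return "ggjt v3 (current)";
  return "unknown";
}

int main() {
  std::printf("%s\n", classify(0x67676a74u, 3));  // prints: ggjt v3 (current)
}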
@ -570,9 +579,9 @@ struct llama_file_saver {
|
|||
write_vocab();
|
||||
}
|
||||
void write_magic() {
|
||||
ggjt_v2();
|
||||
file.write_u32(READ32BE("ggjt")); // magic
|
||||
file.write_u32(2); // version
|
||||
ggjt_v3();
|
||||
file.write_u32(LLAMA_FILE_MAGIC); // magic
|
||||
file.write_u32(LLAMA_FILE_VERSION); // version
|
||||
}
|
||||
void write_hparams(enum llama_ftype new_ftype) {
|
||||
const llama_hparams & hparams = any_file_loader->hparams;
|
||||
|
@ -614,7 +623,7 @@ struct llama_file_saver {
|
|||
file.write_u32(new_type);
|
||||
file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
|
||||
file.write_raw(tensor.name.data(), tensor.name.size());
|
||||
file.seek(-file.tell() & 31, SEEK_CUR);
|
||||
file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
|
||||
LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
|
||||
file.write_raw(new_data, new_size);
|
||||
}
|
||||
|
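The saver pads each tensor's data out to a 32-byte boundary, and the hunk above moves the negation into a signed ptrdiff_t before masking. The padding arithmetic itself is small enough to show as a worked sketch: (-pos) & 31 is exactly the number of bytes needed to reach the next multiple of 32.

// Sketch of the 32-byte alignment pad used before writing tensor data.
#include <cassert>
#include <cstddef>

static size_t pad32(size_t pos) {
  return static_cast<size_t>(-static_cast<ptrdiff_t>(pos) & 31);
}

int main() {
  assert(pad32(0)  == 0);   // already aligned
  assert(pad32(1)  == 31);  // 1 + 31 = 32
  assert(pad32(32) == 0);
  assert(pad32(45) == 19);  // 45 + 19 = 64
}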
@ -629,12 +638,12 @@ struct llama_model_loader {
|
|||
std::unique_ptr<llama_mmap> mapping;
|
||||
|
||||
llama_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
|
||||
auto first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
|
||||
auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
|
||||
file_loaders.emplace_back(first_file);
|
||||
uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
|
||||
for (uint32_t i = 1; i < n_parts; i++) {
|
||||
std::string fname = fname_base + "." + std::to_string(i);
|
||||
auto ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
|
||||
auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
|
||||
file_loaders.emplace_back(ith_file);
|
||||
if (ith_file->hparams != first_file->hparams) {
|
||||
Die("llama.cpp: hparams inconsistent between files");
|
||||
|
@ -681,7 +690,7 @@ struct llama_model_loader {
|
|||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
|
||||
struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
|
||||
auto it = tensors_map.name_to_idx.find(name);
|
||||
if (it == tensors_map.name_to_idx.end()) {
|
||||
Die("llama.cpp: tensor '%s' is missing from model", name.c_str());
|
||||
|
@ -692,10 +701,10 @@ struct llama_model_loader {
|
|||
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
||||
}
|
||||
|
||||
return get_tensor_for(lt);
|
||||
return get_tensor_for(lt, backend);
|
||||
}
|
||||
|
||||
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
|
||||
struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
|
||||
struct ggml_tensor * tensor;
|
||||
if (lt.ne.size() == 2) {
|
||||
tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
|
||||
|
@ -705,12 +714,13 @@ struct llama_model_loader {
|
|||
}
|
||||
ggml_set_name(tensor, lt.name.c_str());
|
||||
LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
|
||||
tensor->backend = backend;
|
||||
lt.ggml_tensor = tensor;
|
||||
num_ggml_tensors_created++;
|
||||
return tensor;
|
||||
}
|
||||
|
||||
void done_getting_tensors() {
|
||||
void done_getting_tensors() const {
|
||||
if (num_ggml_tensors_created != tensors_map.tensors.size()) {
|
||||
Die("llama.cpp: file contained more tensors than expected");
|
||||
}
|
||||
|
@ -718,12 +728,16 @@ struct llama_model_loader {
|
|||
|
||||
void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
|
||||
size_t data_size = 0;
|
||||
size_t prefetch_size = 0;
|
||||
for (const llama_load_tensor & lt : tensors_map.tensors) {
|
||||
data_size += lt.size;
|
||||
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
|
||||
prefetch_size += lt.size;
|
||||
}
|
||||
}
|
||||
|
||||
if (use_mmap) {
|
||||
mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
|
||||
mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
|
||||
if (!lmlock) {
|
||||
// Don't call the callback since the actual loading will be lazy
|
||||
// and we can't measure it.
|
||||
|
@ -736,6 +750,9 @@ struct llama_model_loader {
|
|||
|
||||
size_t done_size = 0;
|
||||
for (llama_load_tensor & lt : tensors_map.tensors) {
|
||||
if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
|
||||
continue;
|
||||
}
|
||||
if (progress_callback) {
|
||||
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
||||
}
|
||||
|
@ -853,8 +870,9 @@ struct llama_context_params llama_context_default_params() {
|
|||
struct llama_context_params result = {
|
||||
/*.n_ctx =*/ 512,
|
||||
/*.n_parts =*/ -1,
|
||||
/*.gpu_layers =*/ 0,
|
||||
/*.seed =*/ -1,
|
||||
/*.f16_kv =*/ false,
|
||||
/*.f16_kv =*/ true,
|
||||
/*.logits_all =*/ false,
|
||||
/*.vocab_only =*/ false,
|
||||
/*.use_mmap =*/ true,
|
||||
|
@ -875,6 +893,21 @@ bool llama_mlock_supported() {
|
|||
return llama_mlock::SUPPORTED;
|
||||
}
|
||||
|
||||
void llama_init_backend() {
|
||||
ggml_time_init();
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
}
|
||||
|
||||
int64_t llama_time_us() {
|
||||
return ggml_time_us();
|
||||
}
|
||||
|
||||
//
|
||||
// model loading
|
||||
//
|
||||
|
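llama_init_backend() replaces the old trick of creating and immediately freeing an empty ggml context just to build the f16 tables (quantize.cc below switches to it). A hedged sketch of the intended startup order; the model path is a placeholder and error handling is trimmed:

// Sketch: one-time backend init, then load a model with the default params.
#include "third_party/ggml/llama.h"

int main(void) {
  llama_init_backend();  // builds the f16 tables, initializes timers
  struct llama_context_params params = llama_context_default_params();
  struct llama_context * ctx = llama_init_from_file("model.bin", params);
  if (!ctx) return 1;
  // ... tokenize, llama_eval(), sample ...
  llama_free(ctx);
  return 0;
}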
@ -884,7 +917,8 @@ static const char *llama_file_version_name(llama_file_version version) {
|
|||
case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
|
||||
case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (pre #613 with sharded files and no mmap support)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
|
||||
case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
|
||||
default: LLAMA_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
@ -907,6 +941,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
|||
|
||||
static const char *llama_model_type_name(e_model type) {
|
||||
switch (type) {
|
||||
case MODEL_3B: return "3B";
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_30B: return "30B";
|
||||
|
@ -919,6 +954,7 @@ static void llama_model_load_internal(
|
|||
const std::string & fname,
|
||||
llama_context & lctx,
|
||||
int n_ctx,
|
||||
int n_gpu_layers,
|
||||
ggml_type memory_type,
|
||||
bool use_mmap,
|
||||
bool use_mlock,
|
||||
|
@ -940,6 +976,7 @@ static void llama_model_load_internal(
|
|||
|
||||
{
|
||||
switch (hparams.n_layer) {
|
||||
case 26: model.type = e_model::MODEL_3B; break;
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 40: model.type = e_model::MODEL_13B; break;
|
||||
case 60: model.type = e_model::MODEL_30B; break;
|
||||
|
@ -986,29 +1023,7 @@ static void llama_model_load_internal(
|
|||
size_t ctx_size, mmapped_size;
|
||||
ml->calc_sizes(&ctx_size, &mmapped_size);
|
||||
if (verbose > 0) {
|
||||
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
|
||||
}
|
||||
|
||||
// print memory requirements
|
||||
{
|
||||
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
||||
|
||||
// this is the total memory required to run the inference
|
||||
const size_t mem_required =
|
||||
ctx_size +
|
||||
mmapped_size +
|
||||
MEM_REQ_SCRATCH0().at(model.type) +
|
||||
MEM_REQ_SCRATCH1().at(model.type) +
|
||||
MEM_REQ_EVAL().at(model.type);
|
||||
|
||||
// this is the memory required by one llama_state
|
||||
const size_t mem_required_state =
|
||||
scale*MEM_REQ_KV_SELF().at(model.type);
|
||||
|
||||
if (verbose > 0) {
|
||||
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||
}
|
||||
fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
|
||||
}
|
||||
|
||||
// create the ggml context
|
||||
|
@ -1031,7 +1046,14 @@ static void llama_model_load_internal(
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
|
||||
#else
|
||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
|
||||
#endif
|
||||
|
||||
// prepare memory for the weights
|
||||
size_t vram_total = 0;
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
|
@ -1041,33 +1063,87 @@ static void llama_model_load_internal(
|
|||
|
||||
ml->ggml_ctx = ctx;
|
||||
|
||||
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
|
||||
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
||||
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
|
||||
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
|
||||
|
||||
// "output" tensor
|
||||
{
|
||||
ggml_backend backend_output;
|
||||
if (n_gpu_layers > int(n_layer)) { // NOLINT
|
||||
backend_output = LLAMA_BACKEND_OFFLOAD;
|
||||
} else {
|
||||
backend_output = GGML_BACKEND_CPU;
|
||||
}
|
||||
|
||||
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
std::string layers_i = "layers." + std::to_string(i);
|
||||
|
||||
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
|
||||
layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
|
||||
|
||||
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
|
||||
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
|
||||
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
|
||||
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
|
||||
layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
|
||||
layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
|
||||
layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
|
||||
layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
|
||||
|
||||
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
|
||||
layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
|
||||
|
||||
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
|
||||
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
|
||||
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
|
||||
layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
|
||||
layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
|
||||
layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_CUDA) {
|
||||
vram_total +=
|
||||
ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
||||
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
|
||||
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ml->done_getting_tensors();
|
||||
|
||||
// print memory requirements
|
||||
{
|
||||
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
||||
|
||||
// this is the total memory required to run the inference
|
||||
const size_t mem_required =
|
||||
ctx_size +
|
||||
mmapped_size - vram_total + // weights in VRAM not in memory
|
||||
MEM_REQ_SCRATCH0().at(model.type) +
|
||||
MEM_REQ_SCRATCH1().at(model.type) +
|
||||
MEM_REQ_EVAL().at(model.type);
|
||||
|
||||
// this is the memory required by one llama_state
|
||||
const size_t mem_required_state =
|
||||
scale*MEM_REQ_KV_SELF().at(model.type);
|
||||
|
||||
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
|
||||
fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
|
||||
if (n_gpu_layers > (int) hparams.n_layer) {
|
||||
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
|
||||
}
|
||||
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
||||
#elif !defined(GGML_USE_CLBLAST)
|
||||
(void) n_gpu_layers;
|
||||
#endif
|
||||
}
|
||||
|
||||
// populate `tensors_by_name`
|
||||
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
||||
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
|
||||
|
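The loader now decides per layer where each weight lives: layers with index below i_gpu_start = n_layer - n_gpu_layers stay on the CPU, the rest go to the offload backend, and output.weight is offloaded only when n_gpu_layers exceeds n_layer. A small sketch that reproduces just that split (the counts are arbitrary examples, and the enum stands in for GGML_BACKEND_CPU/CUDA):

// Sketch of the layer-to-backend assignment used above.
#include <cstdio>

enum backend_t { BACKEND_CPU, BACKEND_OFFLOAD };

int main() {
  const int n_layer      = 32;  // e.g. a 7B model
  const int n_gpu_layers = 20;  // user-requested offload count
  const int i_gpu_start  = n_layer - n_gpu_layers;

  for (int i = 0; i < n_layer; ++i) {
    backend_t b = i < i_gpu_start ? BACKEND_CPU : BACKEND_OFFLOAD;
    std::printf("layer %2d -> %s\n", i, b == BACKEND_CPU ? "cpu" : "gpu");
  }
  std::printf("output.weight -> %s\n", n_gpu_layers > n_layer ? "gpu" : "cpu");
}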
@ -1075,6 +1151,59 @@ static void llama_model_load_internal(
|
|||
|
||||
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
{
|
||||
size_t done_size = 0;
|
||||
size_t data_size = 0;
|
||||
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
||||
data_size += lt.size;
|
||||
if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
|
||||
done_size += lt.size;
|
||||
}
|
||||
}
|
||||
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
|
||||
if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
|
||||
continue;
|
||||
}
|
||||
if (progress_callback) {
|
||||
progress_callback((float) done_size / data_size, progress_callback_user_data);
|
||||
}
|
||||
ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
|
||||
done_size += lt.size;
|
||||
}
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
{
|
||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||
|
||||
fprintf(stderr, "ggml_opencl: offloading %d layers to GPU\n", n_gpu);
|
||||
|
||||
size_t vram_total = 0;
|
||||
|
||||
for (int i = 0; i < n_gpu; ++i) {
|
||||
const auto & layer = model.layers[i];
|
||||
|
||||
ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
|
||||
ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
|
||||
ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
|
||||
ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
|
||||
ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
|
||||
ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
|
||||
ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
|
||||
}
|
||||
if (n_gpu_layers > (int) hparams.n_layer) {
|
||||
fprintf(stderr, "ggml_opencl: offloading output layer to GPU\n");
|
||||
ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
|
||||
}
|
||||
|
||||
fprintf(stderr, "ggml_opencl: total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
|
||||
}
|
||||
#endif
|
||||
|
||||
if (progress_callback) {
|
||||
progress_callback(1.0f, progress_callback_user_data);
|
||||
}
|
||||
|
||||
model.mapping = std::move(ml->mapping);
|
||||
|
||||
// loading time will be recalculate after the first eval, so
|
||||
|
@ -1086,6 +1215,7 @@ static bool llama_model_load(
|
|||
const std::string & fname,
|
||||
llama_context & lctx,
|
||||
int n_ctx,
|
||||
int n_gpu_layers,
|
||||
ggml_type memory_type,
|
||||
bool use_mmap,
|
||||
bool use_mlock,
|
||||
|
@ -1094,7 +1224,7 @@ static bool llama_model_load(
|
|||
void *progress_callback_user_data,
|
||||
int verbose) {
|
||||
// try {
|
||||
llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
|
||||
llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
|
||||
vocab_only, progress_callback, progress_callback_user_data,
|
||||
verbose);
|
||||
return true;
|
||||
|
@ -1175,10 +1305,8 @@ static bool llama_eval_internal(
|
|||
{
|
||||
cur = ggml_rms_norm(ctx0, inpL);
|
||||
|
||||
// cur = attention_norm*cur
|
||||
cur = ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
|
||||
cur);
|
||||
// cur = cur*attention_norm(broadcasted)
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
|
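This and the two normalization sites further down drop the ggml_repeat() and multiply by the 1-D norm weight directly, relying on ggml_mul() broadcasting its second operand across rows and saving an activation-sized temporary. A shape-only sketch of the same call, assuming that broadcast behavior:

// Sketch: scale a [n_embd, N] activation by an [n_embd] weight without repeat.
#include "third_party/ggml/ggml.h"

int main(void) {
  struct ggml_init_params ip = { 32 * 1024 * 1024, NULL, false };
  struct ggml_context * ctx = ggml_init(ip);

  struct ggml_tensor * cur  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 8);
  struct ggml_tensor * norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4096);

  // old: ggml_mul(ctx, ggml_repeat(ctx, norm, cur), cur)
  // new: norm is broadcast across the 8 rows of cur
  struct ggml_tensor * scaled = ggml_mul(ctx, cur, norm);
  (void) scaled;  // node only; nothing is computed without a graph

  ggml_free(ctx);
  return 0;
}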
@ -1285,10 +1413,8 @@ static bool llama_eval_internal(
|
|||
{
|
||||
cur = ggml_rms_norm(ctx0, inpFF);
|
||||
|
||||
// cur = ffn_norm*cur
|
||||
cur = ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
|
||||
cur);
|
||||
// cur = cur*ffn_norm(broadcasted)
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
|
||||
}
|
||||
|
||||
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
|
||||
|
@ -1325,10 +1451,8 @@ static bool llama_eval_internal(
|
|||
|
||||
inpL = ggml_rms_norm(ctx0, inpL);
|
||||
|
||||
// inpL = norm*inpL
|
||||
inpL = ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.norm, inpL),
|
||||
inpL);
|
||||
// inpL = inpL*norm(broadcasted)
|
||||
inpL = ggml_mul(ctx0, inpL, model.norm);
|
||||
|
||||
embeddings = inpL;
|
||||
}
|
||||
|
@ -2149,10 +2273,11 @@ struct llama_context * llama_init_from_file(
|
|||
unsigned percentage = (unsigned) (100 * progress);
|
||||
while (percentage > *cur_percentage_p) {
|
||||
++*cur_percentage_p;
|
||||
fprintf(stderr, ".");
|
||||
// TODO(jart): Fix this.
|
||||
// fprintf(stderr, ".");
|
||||
fflush(stderr);
|
||||
if (percentage >= 100) {
|
||||
fprintf(stderr, "\n");
|
||||
// fprintf(stderr, "\n");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -2163,7 +2288,7 @@ struct llama_context * llama_init_from_file(
|
|||
|
||||
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
|
||||
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
|
||||
params.use_mmap, params.use_mlock, params.vocab_only,
|
||||
params.progress_callback, params.progress_callback_user_data,
|
||||
verbose)) {
|
||||
|
@ -2242,7 +2367,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != READ32BE("ggla")) {
|
||||
if (magic != LLAMA_FILE_MAGIC_GGLA) {
|
||||
fprintf(stderr, "%s: bad file magic\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
@ -2306,7 +2431,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|||
|
||||
// maybe this should in llama_model_loader
|
||||
if (model_loader->use_mmap) {
|
||||
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
|
||||
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2399,7 +2524,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|||
}
|
||||
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
|
||||
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
|
||||
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
|
||||
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
|
||||
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
||||
model_loader->load_data_for(lt);
|
||||
lt.ggml_tensor->data = lt.data;
|
||||
|
@ -2453,7 +2578,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
|
|||
|
||||
n_tensors++;
|
||||
if (n_tensors % 4 == 0) {
|
||||
fprintf(stderr, ".");
|
||||
// fprintf(stderr, ".");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2586,9 +2711,10 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|||
if (kv_size) {
|
||||
const size_t elt_size = ggml_element_size(kv_self.k);
|
||||
|
||||
char buffer[4096];
|
||||
llama_buffer buffer; // [jart] fixed buffer alignment bug
|
||||
buffer.resize(4096);
|
||||
|
||||
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
||||
ggml_context * cpy_ctx = ggml_init({ buffer.size, buffer.addr, /* no_alloc */ true });
|
||||
ggml_cgraph gf{};
|
||||
gf.n_threads = 1;
|
||||
|
||||
|
|
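The session-state copy above swaps a stack char[4096] for a llama_buffer, whose resize() allocates with memalign(32) (see the llama_util.h hunk below), so the scratch handed to ggml_init() in no_alloc mode is always AVX-aligned. A hedged sketch of the same idea with a plain aligned allocation standing in for llama_buffer:

// Sketch: give ggml_init a 32-byte-aligned, caller-owned scratch buffer.
#include <cstdlib>
#include "third_party/ggml/ggml.h"

int main(void) {
  void * scratch = nullptr;
  if (posix_memalign(&scratch, 32, 4096) != 0) return 1;

  struct ggml_init_params ip = { 4096, scratch, /* no_alloc */ true };
  struct ggml_context * cpy_ctx = ggml_init(ip);
  // ... describe the k/v copy tensors in this scratch context ...
  ggml_free(cpy_ctx);
  std::free(scratch);
  return 0;
}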
49
third_party/ggml/llama.h
vendored
|
@ -20,12 +20,23 @@
|
|||
# define LLAMA_API
|
||||
#endif
|
||||
|
||||
#define LLAMA_FILE_VERSION 1
|
||||
#define LLAMA_FILE_MAGIC READ32BE("ggjt")
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED READ32BE("ggml")
|
||||
#define LLAMA_SESSION_MAGIC READ32BE("ggsn")
|
||||
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
|
||||
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
|
||||
#define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf'
|
||||
#define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml'
|
||||
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
|
||||
|
||||
#define LLAMA_FILE_VERSION 3
|
||||
#define LLAMA_FILE_MAGIC LLAMA_FILE_MAGIC_GGJT
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
|
||||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 1
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
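The new LLAMA_FILE_MAGIC_* constants are simply the big-endian 4CCs that READ32BE("...") produced before, written out so the header no longer needs that macro. A quick hedged check of the encoding:

// Sketch: 0x67676a74 is 'g','g','j','t' packed big-endian, and so on.
#include <cstdint>
#include <cstdio>

static constexpr uint32_t be32(char a, char b, char c, char d) {
  return (uint32_t(uint8_t(a)) << 24) | (uint32_t(uint8_t(b)) << 16) |
         (uint32_t(uint8_t(c)) << 8)  |  uint32_t(uint8_t(d));
}

static_assert(be32('g', 'g', 'j', 't') == 0x67676a74u, "ggjt");
static_assert(be32('g', 'g', 'm', 'l') == 0x67676d6cu, "ggml");
static_assert(be32('g', 'g', 's', 'n') == 0x6767736eu, "ggsn");

int main() { std::printf("%08x\n", be32('g', 'g', 'j', 't')); }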
@ -55,9 +66,10 @@ extern "C" {
|
|||
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
||||
|
||||
struct llama_context_params {
|
||||
int n_ctx; // text context
|
||||
int n_parts; // -1 for default
|
||||
int seed; // RNG seed, -1 for random
|
||||
int n_ctx; // text context
|
||||
int n_parts; // -1 for default
|
||||
int n_gpu_layers; // number of layers to store in VRAM
|
||||
int seed; // RNG seed, -1 for random
|
||||
|
||||
bool f16_kv; // use fp16 for KV cache
|
||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||
|
@ -74,16 +86,16 @@ extern "C" {
|
|||
|
||||
// model file types
|
||||
enum llama_ftype {
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
||||
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
|
@ -91,6 +103,13 @@ extern "C" {
|
|||
LLAMA_API bool llama_mmap_supported();
|
||||
LLAMA_API bool llama_mlock_supported();
|
||||
|
||||
// TODO: not great API - very likely to change
|
||||
// Initialize the llama + ggml backend
|
||||
// Call once at the start of the program
|
||||
LLAMA_API void llama_init_backend();
|
||||
|
||||
LLAMA_API int64_t llama_time_us();
|
||||
|
||||
// Various functions for loading a ggml llama model.
|
||||
// Allocate (almost) all memory needed for the model.
|
||||
// Return NULL on failure
|
||||
|
|
2
third_party/ggml/llama_util.h
vendored
|
@ -387,7 +387,7 @@ struct llama_buffer {
|
|||
|
||||
void resize(size_t size) {
|
||||
free(addr);
|
||||
addr = (uint8_t *)memalign(32, size);
|
||||
addr = (uint8_t *)memalign(32, size); // [jart] always avx align
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
|
|
12
third_party/ggml/main.cc
vendored
|
@ -210,17 +210,11 @@ static int on_missing_feature(const char *name) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
void MakeProcessNice(void) {
|
||||
setpriority(PRIO_PROCESS, 0, 10);
|
||||
ioprio_set(IOPRIO_WHO_PROCESS, 0, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
||||
struct sched_param param = {sched_get_priority_min(SCHED_IDLE)};
|
||||
sched_setscheduler(0, SCHED_IDLE, ¶m);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
|
||||
MakeProcessNice();
|
||||
ShowCrashReports();
|
||||
|
||||
setvbuf(stdin, NULL, _IONBF, 0);
|
||||
setvbuf(stdout, NULL, _IONBF, 0);
|
||||
setvbuf(stderr, NULL, _IONBF, 0);
|
||||
|
@ -232,9 +226,7 @@ int main(int argc, char ** argv) {
|
|||
if (!X86_HAVE(AVX)) return on_missing_feature("avx");
|
||||
if (!X86_HAVE(FMA)) return on_missing_feature("fma");
|
||||
if (!X86_HAVE(SSE3)) return on_missing_feature("sse3");
|
||||
if (!X86_HAVE(F16C)) {
|
||||
fprintf(stderr, "%s: warning: cpuid f16c not detected; inference might crash\n", __func__);
|
||||
}
|
||||
if (!X86_HAVE(F16C)) return on_missing_feature("f16c");
|
||||
#endif /* __x86_64__ */
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
|
|
11
third_party/ggml/quantize.cc
vendored
|
@ -55,9 +55,10 @@ static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
|
|||
// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type [nthreads]
|
||||
//
|
||||
int main(int argc, char ** argv) {
|
||||
MakeProcessNice();
|
||||
ShowCrashReports();
|
||||
|
||||
ggjt_v2();
|
||||
ggjt_v3();
|
||||
ggml_time_init();
|
||||
|
||||
if (argc < 3) {
|
||||
|
@ -69,11 +70,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
// needed to initialize f16 tables
|
||||
{
|
||||
struct ggml_init_params params = { 0, NULL, false };
|
||||
struct ggml_context * ctx = ggml_init(params);
|
||||
ggml_free(ctx);
|
||||
}
|
||||
llama_init_backend();
|
||||
|
||||
const std::string fname_inp = argv[1];
|
||||
const std::string fname_out = argv[2];
|
||||
|
@ -95,7 +92,7 @@ int main(int argc, char ** argv) {
|
|||
ftype = (enum llama_ftype)atoi(argv[3]);
|
||||
}
|
||||
|
||||
int nthread = argc > 4 ? atoi(argv[4]) : 0;
|
||||
int nthread = argc > 4 ? atoi(argv[4]) : std::min(20, std::max(1, _getcpucount() >> 1));
|
||||
|
||||
const int64_t t_main_start_us = ggml_time_us();
|
||||
|
||||
|
|
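quantize's default thread count is now half the online CPUs, clamped to the range [1, 20], instead of being left at 0 (_getcpucount() is the cosmo libc call used in the hunk above). A tiny worked sketch of the clamp:

// Sketch of the default-thread clamp: half the CPUs, at least 1, at most 20.
#include <algorithm>
#include <cstdio>

static int default_nthread(int ncpu) {
  return std::min(20, std::max(1, ncpu >> 1));
}

int main() {
  std::printf("%d %d %d\n",
              default_nthread(1),     // -> 1
              default_nthread(12),    // -> 6
              default_nthread(128));  // -> 20
}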
1
third_party/radpajama/copy-gptneox.cc
vendored
|
@ -48,6 +48,7 @@ static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
|
|||
// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
|
||||
//
|
||||
int main(int argc, char ** argv) {
|
||||
MakeProcessNice();
|
||||
ShowCrashReports();
|
||||
|
||||
ggjt_v1();
|
||||
|
|
2
third_party/radpajama/main-redpajama-chat.cc
vendored
|
@ -99,6 +99,7 @@ int main(int argc, char ** argv) {
|
|||
params.instruct = true;
|
||||
params.interactive = true;
|
||||
|
||||
MakeProcessNice();
|
||||
ShowCrashReports();
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) { return 1; }
|
||||
|
@ -136,6 +137,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
MakeProcessNice();
|
||||
ShowCrashReports();
|
||||
|
||||
// Always interactive for RedPajama chat model
|
||||
|
|
1
third_party/radpajama/main-redpajama.cc
vendored
|
@ -84,6 +84,7 @@ int main(int argc, char ** argv) {
|
|||
gpt_params params;
|
||||
params.model = "./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Instruct-3B-v1-f16.bin";
|
||||
|
||||
MakeProcessNice();
|
||||
ShowCrashReports();
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
|
|
1
third_party/radpajama/quantize-gptneox.cc
vendored
|
@ -50,6 +50,7 @@ static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
|
|||
// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
|
||||
//
|
||||
int main(int argc, char ** argv) {
|
||||
MakeProcessNice();
|
||||
ShowCrashReports();
|
||||
|
||||
ggjt_v2();
|
||||
|
|