Delete GGML and RadPajama

Please use https://github.com/mozilla-Ocho/llamafile which is better, newer, and built on cosmocc. If you need the RadPajama model, file an issue with llamafile asking for support.
2025-06-27 06:48:31 +00:00 · 2023-12-10 06:08:54 -08:00 · 2023-12-10 06:08:54 -08:00 · 6ee5580adc
commit 6ee5580adc
parent 4a63e4a27e
67 changed files with 0 additions and 35420 deletions
--- a/2
+++ b/2
@ -224,9 +224,7 @@ include dsp/BUILD.mk				# │
 include third_party/stb/BUILD.mk		# │
 include third_party/mbedtls/BUILD.mk		# │
 include third_party/libcxx/BUILD.mk		# │
-include third_party/ggml/BUILD.mk		# │
 include third_party/pcre/BUILD.mk		# │
-include third_party/radpajama/BUILD.mk		# │
 include net/https/BUILD.mk			# │
 include third_party/regex/BUILD.mk		#─┘
 include third_party/tidy/BUILD.mk
--- a/third_party/BUILD.mk
+++ b/third_party/BUILD.mk
@ -13,7 +13,6 @@ o/$(MODE)/third_party:				\
 	o/$(MODE)/third_party/finger		\
 	o/$(MODE)/third_party/gdtoa		\
 	o/$(MODE)/third_party/getopt		\
-	o/$(MODE)/third_party/ggml		\
 	o/$(MODE)/third_party/hiredis		\
 	o/$(MODE)/third_party/libcxx		\
 	o/$(MODE)/third_party/linenoise		\
@ -28,7 +27,6 @@ o/$(MODE)/third_party:				\
 	o/$(MODE)/third_party/puff		\
 	o/$(MODE)/third_party/python		\
 	o/$(MODE)/third_party/quickjs		\
-	o/$(MODE)/third_party/radpajama		\
 	o/$(MODE)/third_party/regex		\
 	o/$(MODE)/third_party/sed		\
 	o/$(MODE)/third_party/smallz4		\
--- a/third_party/ggml/BUILD.mk
+++ b/third_party/ggml/BUILD.mk
@ -1,207 +0,0 @@
-#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
-#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘
-
-PKGS += THIRD_PARTY_GGML
-
-################################################################################
-# single file machine learning framework written in c
-# make -j8 o//third_party/ggml/ggml.a
-
-THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_A
-THIRD_PARTY_GGML = $(THIRD_PARTY_GGML_A_DEPS) $(THIRD_PARTY_GGML_A)
-THIRD_PARTY_GGML_A = o/$(MODE)/third_party/ggml/ggml.a
-THIRD_PARTY_GGML_A_OBJS = $(THIRD_PARTY_GGML_A_SRCS:%.c=o/$(MODE)/%.o)
-THIRD_PARTY_GGML_A_FILES = $(THIRD_PARTY_GGML_A_SRCS) $(THIRD_PARTY_GGML_A_HDRS)
-THIRD_PARTY_GGML_A_CHECKS = $(THIRD_PARTY_GGML_A).pkg $(THIRD_PARTY_GGML_A_HDRS:%=o/$(MODE)/%.ok)
-
-THIRD_PARTY_GGML_A_HDRS =						\
-	third_party/ggml/fp16.h						\
-	third_party/ggml/ggml.h						\
-	third_party/ggml/ggjt.v1.q4_0.h					\
-	third_party/ggml/ggjt.v1.q4_1.h					\
-	third_party/ggml/ggjt.v1.q4_2.h					\
-	third_party/ggml/ggjt.v1.q5_0.h					\
-	third_party/ggml/ggjt.v1.q5_1.h					\
-	third_party/ggml/ggjt.v1.q8_0.h					\
-	third_party/ggml/ggjt.v1.q8_1.h					\
-	third_party/ggml/ggjt.v2.q4_0.h					\
-	third_party/ggml/ggjt.v2.q4_1.h					\
-	third_party/ggml/ggjt.v2.q5_0.h					\
-	third_party/ggml/ggjt.v2.q5_1.h					\
-	third_party/ggml/ggjt.v2.q8_0.h					\
-	third_party/ggml/ggjt.v2.q8_1.h					\
-	third_party/ggml/fp16.internal.h				\
-	third_party/ggml/ggjt.v1.internal.h				\
-	third_party/ggml/ggjt.v2.internal.h
-
-THIRD_PARTY_GGML_A_SRCS =						\
-	third_party/ggml/fp16.c						\
-	third_party/ggml/ggml.c						\
-	third_party/ggml/ggjt.v1.c					\
-	third_party/ggml/ggjt.v1.q4_0.c					\
-	third_party/ggml/ggjt.v1.q4_1.c					\
-	third_party/ggml/ggjt.v1.q4_2.c					\
-	third_party/ggml/ggjt.v1.q5_0.c					\
-	third_party/ggml/ggjt.v1.q5_1.c					\
-	third_party/ggml/ggjt.v1.q8_0.c					\
-	third_party/ggml/ggjt.v1.q8_1.c					\
-	third_party/ggml/ggjt.v2.c					\
-	third_party/ggml/ggjt.v2.q4_0.c					\
-	third_party/ggml/ggjt.v2.q4_1.c					\
-	third_party/ggml/ggjt.v2.q5_0.c					\
-	third_party/ggml/ggjt.v2.q5_1.c					\
-	third_party/ggml/ggjt.v2.q8_0.c					\
-	third_party/ggml/ggjt.v2.q8_1.c
-
-THIRD_PARTY_GGML_A_DIRECTDEPS =						\
-	LIBC_CALLS							\
-	LIBC_INTRIN							\
-	LIBC_FMT							\
-	LIBC_MEM							\
-	LIBC_NEXGEN32E							\
-	LIBC_RUNTIME							\
-	LIBC_STDIO							\
-	LIBC_THREAD							\
-	LIBC_STR							\
-        LIBC_PROC							\
-	LIBC_SYSV							\
-	LIBC_TINYMATH							\
-	THIRD_PARTY_COMPILER_RT
-
-THIRD_PARTY_GGML_A_DEPS :=						\
-	$(call uniq,$(foreach x,$(THIRD_PARTY_GGML_A_DIRECTDEPS),$($(x))))
-
-$(THIRD_PARTY_GGML_A):							\
-		third_party/ggml/					\
-		$(THIRD_PARTY_GGML_A).pkg				\
-		$(THIRD_PARTY_GGML_A_OBJS)
-
-$(THIRD_PARTY_GGML_A).pkg:						\
-		$(THIRD_PARTY_GGML_A_OBJS)				\
-		$(foreach x,$(THIRD_PARTY_GGML_A_DIRECTDEPS),$($(x)_A).pkg)
-
-$(THIRD_PARTY_GGML_A_OBJS): private					\
-		CFLAGS +=						\
-			-O3						\
-			-ffunction-sections				\
-			-fdata-sections
-
-ifeq ($(ARCH), x86_64)
-$(THIRD_PARTY_GGML_A_OBJS): private					\
-		CFLAGS +=						\
-			-msse3						\
-			-mavx						\
-			-mavx2						\
-			-mf16c						\
-			-mfma
-endif
-
-o/opt/third_party/ggml/ggml.o: private					\
-		CFLAGS +=						\
-			-x-no-pg
-
-################################################################################
-# command for running inference on large language models
-# make -j8 o//third_party/ggml/llama.com
-
-THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_LLAMA
-THIRD_PARTY_GGML_LLAMA = o/$(MODE)/third_party/ggml/llama.com
-THIRD_PARTY_GGML_LLAMA_OBJS = $(THIRD_PARTY_GGML_LLAMA_SRCS:%.cc=o/$(MODE)/%.o)
-THIRD_PARTY_GGML_LLAMA_FILES := $(THIRD_PARTY_GGML_LLAMA_SRCS) $(THIRD_PARTY_GGML_LLAMA_HDRS)
-THIRD_PARTY_GGML_LLAMA_CHECKS = $(THIRD_PARTY_GGML_LLAMA).pkg $(THIRD_PARTY_GGML_LLAMA_HDRS:%=o/$(MODE)/%.okk)
-
-THIRD_PARTY_GGML_LLAMA_HDRS =						\
-	third_party/ggml/common.cc					\
-	third_party/ggml/llama.h					\
-	third_party/ggml/llama_util.h					\
-	third_party/ggml/common.h
-
-THIRD_PARTY_GGML_LLAMA_SRCS =						\
-	third_party/ggml/main.cc					\
-	third_party/ggml/llama.cc					\
-	third_party/ggml/common.cc					\
-	third_party/ggml/quantize.cc					\
-	third_party/ggml/perplexity.cc
-
-THIRD_PARTY_GGML_LLAMA_DIRECTDEPS =					\
-	LIBC_CALLS							\
-	LIBC_FMT							\
-	LIBC_INTRIN							\
-	LIBC_MEM							\
-	LIBC_NEXGEN32E							\
-	LIBC_RUNTIME							\
-	LIBC_STDIO							\
-	LIBC_LOG							\
-        LIBC_PROC							\
-	LIBC_STR							\
-	LIBC_SYSV							\
-	LIBC_SYSV_CALLS							\
-	LIBC_THREAD							\
-	LIBC_TINYMATH							\
-	THIRD_PARTY_GGML						\
-	THIRD_PARTY_LIBCXX
-
-THIRD_PARTY_GGML_LLAMA_DEPS :=						\
-	$(call uniq,$(foreach x,$(THIRD_PARTY_GGML_LLAMA_DIRECTDEPS),$($(x))))
-
-$(THIRD_PARTY_GGML_LLAMA).dbg:						\
-		$(THIRD_PARTY_GGML_LLAMA).pkg				\
-		$(THIRD_PARTY_GGML_LLAMA_DEPS)				\
-		o/$(MODE)/third_party/ggml/companionai.txt.zip.o	\
-		o/$(MODE)/third_party/ggml/common.o			\
-		o/$(MODE)/third_party/ggml/llama.o			\
-		o/$(MODE)/third_party/ggml/main.o			\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/third_party/ggml/quantize.com.dbg:				\
-		$(THIRD_PARTY_GGML_LLAMA).pkg				\
-		$(THIRD_PARTY_GGML_LLAMA_DEPS)				\
-		o/$(MODE)/third_party/ggml/common.o			\
-		o/$(MODE)/third_party/ggml/llama.o			\
-		o/$(MODE)/third_party/ggml/quantize.o			\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/third_party/ggml/perplexity.com.dbg:				\
-		$(THIRD_PARTY_GGML_LLAMA).pkg				\
-		$(THIRD_PARTY_GGML_LLAMA_DEPS)				\
-		o/$(MODE)/third_party/ggml/common.o			\
-		o/$(MODE)/third_party/ggml/llama.o			\
-		o/$(MODE)/third_party/ggml/perplexity.o			\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-$(THIRD_PARTY_GGML_LLAMA).pkg:						\
-		$(THIRD_PARTY_GGML_LLAMA_OBJS)				\
-		$(foreach x,$(THIRD_PARTY_GGML_LLAMA_DIRECTDEPS),$($(x)_A).pkg)
-
-o/$(MODE)/third_party/ggml/companionai.txt.zip.o: private		\
-		ZIPOBJ_FLAGS +=						\
-			-B
-
-o/$(MODE)/third_party/ggml/ggml.o: private QUOTA = -C64
-o/$(MODE)/third_party/ggml/llama.o: private QUOTA = -C64
-
-################################################################################
-
-THIRD_PARTY_GGML_COMS =							\
-	$(THIRD_PARTY_GGML_LLAMA)					\
-	o/$(MODE)/third_party/ggml/quantize.com				\
-	o/$(MODE)/third_party/ggml/perplexity.com
-
-THIRD_PARTY_GGML_BINS = $(THIRD_PARTY_GGML_COMS) $(THIRD_PARTY_GGML_COMS:%=%.dbg)
-THIRD_PARTY_GGML_LIBS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)))
-THIRD_PARTY_GGML_SRCS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_SRCS))
-THIRD_PARTY_GGML_HDRS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_HDRS))
-THIRD_PARTY_GGML_OBJS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_OBJS))
-THIRD_PARTY_GGML_CHECKS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_CHECKS))
-$(THIRD_PARTY_GGML_OBJS): third_party/ggml/BUILD.mk
-
-.PHONY: o/$(MODE)/third_party/ggml
-o/$(MODE)/third_party/ggml:						\
-		$(THIRD_PARTY_GGML_BINS)				\
-		$(THIRD_PARTY_GGML_CHECKS)
--- a/third_party/ggml/LICENSE
+++ b/third_party/ggml/LICENSE
@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2023 Georgi Gerganov
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
--- a/third_party/ggml/README.cosmo
+++ b/third_party/ggml/README.cosmo
@ -1,28 +0,0 @@
-DESCRIPTION
-
-  ggml is a machine learning library useful for LLM inference on CPUs
-
-LICENSE
-
-  MIT
-
-ORIGIN
-
-  https://github.com/ggerganov/llama.cpp
-  d8bd0013e8768aaa3dc9cfc1ff01499419d5348e
-
-LOCAL CHANGES
-
-  - Maintaining support for deprecated file formats
-  - Make it possible for loaded prompts to be cached to disk
-  - Introduce -v and --verbose flags
-  - Reduce batch size from 512 to 32
-  - Allow --n_keep to specify a substring of prompt
-  - Don't print stats / diagnostics unless -v is passed
-  - Reduce --top_p default from 0.95 to 0.70
-  - Change --reverse-prompt to no longer imply --interactive
-  - Permit --reverse-prompt specifying custom EOS if non-interactive
-  - Refactor headers per cosmo convention
-  - Remove C++ exceptions; use Die() function instead
-  - Removed division from matrix multiplication.
-  - Let quantizer convert between ggmt formats
--- a/third_party/ggml/common.cc
+++ b/third_party/ggml/common.cc
@ -1,851 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                             :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  llama.com                                                                   │
-│  Copyright (c) 2023 Justine Alexandra Roberts Tunney                         │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/common.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/termios.h"
-#include "libc/calls/termios.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/fileno.h"
-#include "libc/sysv/consts/termios.h"
-#include "third_party/ggml/llama.h"
-#include "third_party/ggml/llama_util.h"
-#include "third_party/libcxx/algorithm"
-#include "third_party/libcxx/cassert"
-#include "third_party/libcxx/cstring"
-#include "third_party/libcxx/fstream"
-#include "third_party/libcxx/iterator"
-#include "third_party/libcxx/sstream"
-#include "third_party/libcxx/string"
-
-__static_yoink("zipos");
-
-asm(".ident\t\"\\n\\n\
-llama.cpp (MIT License)\\n\
-Copyright (c) 2023 Georgi Gerganov\"");
-asm(".include \"libc/disclaimer.inc\"");
-
-static std::string replace_all(std::string const& original,
-                               std::string const& before,
-                               std::string const& after) {
-    // https://stackoverflow.com/a/7724536/1653720
-    std::string retval;
-    std::string::const_iterator end = original.end();
-    std::string::const_iterator current = original.begin();
-    std::string::const_iterator next =
-            std::search(current, end, before.begin(), before.end());
-    while (next != end) {
-        retval.append(current, next);
-        retval.append(after);
-        current = next + before.size();
-        next = std::search(current, end, before.begin(), before.end());
-    }
-    retval.append(current, next);
-    return retval;
-}
-
-static bool append_file_to_prompt(const char *path, gpt_params & params) {
-    std::ifstream file(path);
-    if (!file) {
-        fprintf(stderr, "error: failed to open file '%s'\n", path);
-        return false;
-    }
-    std::copy(std::istreambuf_iterator<char>(file),
-              std::istreambuf_iterator<char>(),
-              back_inserter(params.prompt));
-    if (params.prompt.back() == '\n') {
-        params.prompt.pop_back();
-    }
-    return true;
-}
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    params.n_threads = std::min(20., (unsigned)__get_cpu_count() * .75);
-
-    bool invalid_param = false;
-    std::string arg;
-    gpt_params default_params;
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.seed = std::stoi(argv[i]);
-        } else if (arg == "-v" || arg == "--verbose") {
-            ++params.verbose;
-        } else if (arg == "-q" || arg == "--quiet") {
-            --params.verbose;
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt = argv[i];
-        } else if (arg == "-C" || arg == "--prompt_cache") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt_path = argv[i];
-        } else if (arg == "-f" || arg == "--file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            if (!append_file_to_prompt(argv[i], params)) {
-                invalid_param = true;
-                break;
-            }
-        } else if (arg == "-n" || arg == "--n_predict") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "--top_k") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.top_k = std::stoi(argv[i]);
-        } else if (arg == "-c" || arg == "--ctx_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--memory_f32") {
-            params.memory_f16 = false;
-        } else if (arg == "--top_p") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.top_p = std::stof(argv[i]);
-        } else if (arg == "--temp") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.temp = std::stof(argv[i]);
-        } else if (arg == "--repeat_last_n") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.repeat_last_n = std::stoi(argv[i]);
-        } else if (arg == "--repeat_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.repeat_penalty = std::stof(argv[i]);
-        } else if (arg == "--frequency_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.frequency_penalty = std::stof(argv[i]);
-        } else if (arg == "--presence_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.presence_penalty = std::stof(argv[i]);
-        } else if (arg == "--mirostat") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.mirostat = std::stoi(argv[i]);
-        } else if (arg == "--mirostat_lr") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.mirostat_eta = std::stof(argv[i]);
-        } else if (arg == "--mirostat_ent") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.mirostat_tau = std::stof(argv[i]);
-        } else if (arg == "-b" || arg == "--batch_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
-        } else if (arg == "--keep") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_keep_str = argv[i];
-            if (is_integer_str(argv[i])) {
-                params.n_keep = std::stoi(params.n_keep_str);
-                if (!params.n_keep) {
-                    params.n_keep_str = "";
-                }
-            }
-        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model = argv[i];
-        } else if (arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter = argv[i];
-            params.use_mmap = false;
-        } else if (arg == "--lora-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        } else if (arg == "-i" || arg == "--interactive") {
-            params.interactive = true;
-        } else if (arg == "--embedding") {
-            params.embedding = true;
-        } else if (arg == "--interactive-first") {
-            params.interactive_first = true;
-        } else if (arg == "-ins" || arg == "--instruct") {
-            params.instruct = true;
-        } else if (arg == "--multiline-input") {
-            params.multiline_input = true;
-        } else if (arg == "--color") {
-            params.use_color = true;
-        } else if (arg == "--mlock") {
-            params.use_mlock = true;
-        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-            params.n_gpu_layers = std::stoi(argv[i]);
-#else
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
-        } else if (arg == "--no-mmap") {
-            params.use_mmap = false;
-        } else if (arg == "--mtest") {
-            params.mem_test = true;
-        } else if (arg == "--verbose-prompt") {
-            params.verbose_prompt = true;
-        } else if (arg == "-r" || arg == "--stop" || arg == "--reverse-prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.antiprompt.push_back(argv[i]);
-        } else if (arg == "--perplexity") {
-            params.perplexity = true;
-        } else if (arg == "--ignore-eos") {
-            params.logit_bias[llama_token_eos()] = -INFINITY;
-        } else if (arg == "--no-penalize-nl") {
-            params.penalize_nl = false;
-        } else if (arg == "-l" || arg == "--logit-bias") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::stringstream ss(argv[i]);
-            llama_token key = 0;
-            char sign = 0;
-            std::string value_str;
-            if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-            } else {
-                invalid_param = true;
-                break;
-            }
-        } else if (arg == "--n_parts") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_parts = std::stoi(argv[i]);
-        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(stdout, argc, argv, default_params);
-            exit(0);
-        } else if (arg == "--random-prompt") {
-            params.random_prompt = true;
-        } else if (arg == "--in-prefix") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.input_prefix = argv[i];
-        } else if (arg == "--in-suffix") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.input_suffix = argv[i];
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(stderr, argc, argv, default_params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(stderr, argc, argv, default_params);
-        exit(1);
-    }
-
-    // if no prompt is specified, then use companion ai
-    if (params.prompt.empty()) {
-        if (params.verbose > 0) {
-            fprintf(stderr, "%s: No prompt specified\n", __func__);
-            fprintf(stderr, "%s: Loading CompanionAI\n", __func__);
-        }
-        if (fileexists("third_party/ggml/companionai.txt")) {
-            append_file_to_prompt("third_party/ggml/companionai.txt", params);
-        } else {
-            append_file_to_prompt("/zip/companionai.txt", params);
-        }
-        const char *user;
-        user = getenv("USER");
-        if (!user || !*user) {
-            user = "Cosmo";
-        }
-        params.prompt = replace_all(params.prompt, "USER_NAME", user);
-        std::string user_prompt;
-        user_prompt.append(user);
-        user_prompt.append(":");
-        params.logit_bias[llama_token_eos()] = -INFINITY;
-        params.antiprompt.push_back(user_prompt);
-        params.repeat_penalty = 1.17647;
-        params.repeat_last_n = 256;
-        params.interactive = true;
-        params.n_predict = -1;
-        params.n_ctx = 2048;
-        params.n_keep = 0;
-        params.n_keep_str = "\n\n\n";
-        params.top_k = 40;
-        params.top_p = .5;
-        params.temp = 0.4;
-    }
-
-    return true;
-}
-
-void gpt_print_usage(FILE *f, int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(f, "usage: %s [options]\n", argv[0]);
-    fprintf(f, "\n");
-    fprintf(f, "options:\n");
-    fprintf(f, "  -h, --help            show this help message and exit\n");
-    fprintf(f, "  -v, --verbose         print helpful information to stderr [repeatable]\n");
-    fprintf(f, "  -s, --silent          disables ephemeral progress indicators [repeatable]\n");
-    fprintf(f, "  -i, --interactive     run in interactive mode\n");
-    fprintf(f, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(f, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    fprintf(f, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(f, "  -r PROMPT, --stop PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(f, "                        stop generating text when the specified text is encountered.\n");
-    fprintf(f, "                        this option may be repeated.\n");
-    fprintf(f, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(f, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(f, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(f, "                        prompt to start generation with (default: Companion AI)\n");
-    fprintf(f, "  --random-prompt       start with a randomized prompt.\n");
-    fprintf(f, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(f, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
-    fprintf(f, "  -f FNAME, --file FNAME\n");
-    fprintf(f, "                        text file containing prompt (default: Companion AI)\n");
-    fprintf(f, "  -C FNAME, --prompt_cache FNAME\n");
-    fprintf(f, "                        path of cache for fast prompt reload (default: .prompt.jtlp)\n");
-    fprintf(f, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(f, "  --top_k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(f, "  --top_p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    fprintf(f, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    fprintf(f, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(f, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(f, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(f, "  --presence_penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(f, "  --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    fprintf(f, "  --mirostat N          use Mirostat sampling.\n");
-    fprintf(f, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    fprintf(f, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(f, "  --mirostat_lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(f, "  --mirostat_ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    fprintf(f, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    fprintf(f, "                        modifies the likelihood of token appearing in the completion,\n");
-    fprintf(f, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    fprintf(f, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(f, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
-    fprintf(f, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
-    fprintf(f, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(f, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    fprintf(f, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(f, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
-    fprintf(f, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(f, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
-    fprintf(f, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(f, "  --perplexity          compute perplexity over the prompt\n");
-    fprintf(f, "  --keep NUM|STR        number of tokens to keep from the initial prompt, or substring\n");
-    fprintf(f, "                        to search for within prompt that divides the actual prompt from\n");
-    fprintf(f, "                        its initial example text (default: %d, -1 = all)\n", params.n_keep);
-    if (llama_mlock_supported()) {
-        fprintf(f, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
-    }
-    if (llama_mmap_supported()) {
-        fprintf(f, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
-    }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, "                        number of layers to store in VRAM\n");
-#endif
-    fprintf(f, "  --mtest               compute maximum memory usage\n");
-    fprintf(f, "  --verbose-prompt      print prompt before generation\n");
-    fprintf(f, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(f, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(f, "  -m FNAME, --model FNAME\n");
-    fprintf(f, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(f, "\n");
-}
-
-std::string gpt_random_prompt(std::mt19937 & rng) {
-    const int r = rng() % 10;
-    switch (r) {
-        case 0: return "So";
-        case 1: return "Once upon a time";
-        case 2: return "When";
-        case 3: return "The";
-        case 4: return "After";
-        case 5: return "If";
-        case 6: return "import";
-        case 7: return "He";
-        case 8: return "She";
-        case 9: return "They";
-        default: return "To";
-    }
-    return "The";
-}
-
-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-    return res;
-}
-
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
-    auto lparams = llama_context_default_params();
-
-    lparams.n_ctx      = params.n_ctx;
-    lparams.n_parts    = params.n_parts;
-    lparams.seed       = params.seed;
-    lparams.f16_kv     = params.memory_f16;
-    lparams.use_mmap   = params.use_mmap;
-    lparams.use_mlock  = params.use_mlock;
-    lparams.logits_all = params.perplexity;
-    lparams.embedding  = params.embedding;
-
-    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams, params.verbose);
-
-    if (lctx == NULL) {
-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return NULL;
-    }
-
-    if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(lctx,
-                                             params.lora_adapter.c_str(),
-                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return NULL;
-        }
-    }
-
-    return lctx;
-}
-
-void console_init(console_state & con_st) {
-#if defined(_WIN32)
-    // Windows-specific console initialization
-    DWORD dwMode = 0;
-    con_st.hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
-    if (con_st.hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(con_st.hConsole, &dwMode)) {
-        con_st.hConsole = GetStdHandle(STD_ERROR_HANDLE);
-        if (con_st.hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(con_st.hConsole, &dwMode))) {
-            con_st.hConsole = NULL;
-        }
-    }
-    if (con_st.hConsole) {
-        // Enable ANSI colors on Windows 10+
-        if (con_st.use_color && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
-            SetConsoleMode(con_st.hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
-        }
-        // Set console output codepage to UTF8
-        SetConsoleOutputCP(CP_UTF8);
-    }
-    HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
-    if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
-        // Set console input codepage to UTF16
-        _setmode(_fileno(stdin), _O_WTEXT);
-
-        // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
-        dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
-        SetConsoleMode(hConIn, dwMode);
-    }
-#else
-    // POSIX-specific console initialization
-    struct termios new_termios;
-    tcgetattr(STDIN_FILENO, &con_st.prev_state);
-    new_termios = con_st.prev_state;
-    new_termios.c_lflag &= ~(ICANON | ECHO);
-    new_termios.c_cc[VMIN] = 1;
-    new_termios.c_cc[VTIME] = 0;
-    tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
-
-    con_st.tty = fopen("/dev/tty", "w+");
-    if (con_st.tty != nullptr) {
-        setvbuf(con_st.tty, NULL, _IONBF, 0);
-        con_st.out = con_st.tty;
-    }
-
-    setlocale(LC_ALL, "");
-#endif
-}
-
-void console_cleanup(console_state & con_st) {
-    // Reset console color
-    console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-#if !defined(_WIN32)
-    if (con_st.tty != nullptr) {
-        con_st.out = stdout;
-        fclose(con_st.tty);
-        con_st.tty = nullptr;
-    }
-    // Restore the terminal settings on POSIX systems
-    tcsetattr(STDIN_FILENO, TCSANOW, &con_st.prev_state);
-#endif
-}
-
-/* Keep track of current color of output, and emit ANSI code if it changes. */
-void console_set_color(console_state & con_st, console_color_t color) {
-    if (con_st.use_color && con_st.color != color) {
-        fflush(stdout);
-        switch(color) {
-            case CONSOLE_COLOR_DEFAULT:
-                fprintf(con_st.out, ANSI_COLOR_RESET);
-                break;
-            case CONSOLE_COLOR_PROMPT:
-                fprintf(con_st.out, ANSI_COLOR_YELLOW);
-                break;
-            case CONSOLE_COLOR_USER_INPUT:
-                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
-                break;
-        }
-        con_st.color = color;
-        fflush(con_st.out);
-    }
-}
-
-char32_t getchar32() {
-    wchar_t wc = getwchar();
-    if (static_cast<wint_t>(wc) == WEOF) {
-        return WEOF;
-    }
-
-#if WCHAR_MAX == 0xFFFF
-    if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-        wchar_t low_surrogate = getwchar();
-        if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
-            return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
-        }
-    }
-    if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
-        return 0xFFFD; // Return the replacement character U+FFFD
-    }
-#endif
-
-    return static_cast<char32_t>(wc);
-}
-
-void pop_cursor(console_state & con_st) {
-#if defined(_WIN32)
-    if (con_st.hConsole != NULL) {
-        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-        GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo);
-
-        COORD newCursorPosition = bufferInfo.dwCursorPosition;
-        if (newCursorPosition.X == 0) {
-            newCursorPosition.X = bufferInfo.dwSize.X - 1;
-            newCursorPosition.Y -= 1;
-        } else {
-            newCursorPosition.X -= 1;
-        }
-
-        SetConsoleCursorPosition(con_st.hConsole, newCursorPosition);
-        return;
-    }
-#endif
-    putc('\b', con_st.out);
-}
-
-int estimateWidth(char32_t codepoint) {
-#if defined(_WIN32)
-    return 1;
-#else
-    return wcwidth(codepoint);
-#endif
-}
-
-int put_codepoint(console_state & con_st, const char* utf8_codepoint, size_t length, int expectedWidth) {
-#if defined(_WIN32)
-    CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-    if (!GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo)) {
-        // go with the default
-        return expectedWidth;
-    }
-    COORD initialPosition = bufferInfo.dwCursorPosition;
-    DWORD nNumberOfChars = length;
-    WriteConsole(con_st.hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
-
-    CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
-    GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
-
-    // Figure out our real position if we're in the last column
-    if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
-        DWORD nNumberOfChars;
-        WriteConsole(con_st.hConsole, &" \b", 2, &nNumberOfChars, NULL);
-        GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
-    }
-
-    int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
-    if (width < 0) {
-        width += newBufferInfo.dwSize.X;
-    }
-    return width;
-#else
-    // we can trust expectedWidth if we've got one
-    if (expectedWidth >= 0 || con_st.tty == nullptr) {
-        fwrite(utf8_codepoint, length, 1, con_st.out);
-        return expectedWidth;
-    }
-
-    fputs("\033[6n", con_st.tty); // Query cursor position
-    int x1, x2, y1, y2;
-    int results = 0;
-    results = fscanf(con_st.tty, "\033[%d;%dR", &y1, &x1);
-
-    fwrite(utf8_codepoint, length, 1, con_st.tty);
-
-    fputs("\033[6n", con_st.tty); // Query cursor position
-    results += fscanf(con_st.tty, "\033[%d;%dR", &y2, &x2);
-
-    if (results != 4) {
-        return expectedWidth;
-    }
-
-    int width = x2 - x1;
-    if (width < 0) {
-        // Calculate the width considering text wrapping
-        struct winsize w;
-        tcgetwinsize(STDOUT_FILENO, &w);
-        width += w.ws_col;
-    }
-    return width;
-#endif
-}
-
-void replace_last(console_state & con_st, char ch) {
-#if defined(_WIN32)
-    pop_cursor(con_st);
-    put_codepoint(con_st, &ch, 1, 1);
-#else
-    fprintf(con_st.out, "\b%c", ch);
-#endif
-}
-
-void append_utf8(char32_t ch, std::string & out) {
-    if (ch <= 0x7F) {
-        out.push_back(static_cast<unsigned char>(ch));
-    } else if (ch <= 0x7FF) {
-        out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-    } else if (ch <= 0xFFFF) {
-        out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-    } else if (ch <= 0x10FFFF) {
-        out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-    } else {
-        // Invalid Unicode code point
-    }
-}
-
-// Helper function to remove the last UTF-8 character from a string
-void pop_back_utf8_char(std::string & line) {
-    if (line.empty()) {
-        return;
-    }
-
-    size_t pos = line.length() - 1;
-
-    // Find the start of the last UTF-8 character (checking up to 4 bytes back)
-    for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
-        if ((line[pos] & 0xC0) != 0x80) break; // Found the start of the character
-    }
-    line.erase(pos);
-}
-
-bool console_readline(console_state & con_st, std::string & line) {
-    console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-    if (con_st.out != stdout) {
-        fflush(stdout);
-    }
-
-    line.clear();
-    std::vector<int> widths;
-    bool is_special_char = false;
-    bool end_of_stream = false;
-
-    char32_t input_char;
-    while (true) {
-        fflush(con_st.out); // Ensure all output is displayed before waiting for input
-        input_char = getchar32();
-
-        if (input_char == '\r' || input_char == '\n') {
-            break;
-        }
-
-        if (input_char == WEOF || input_char == 0x04 /* Ctrl+D*/) {
-            end_of_stream = true;
-            break;
-        }
-
-        if (is_special_char) {
-            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-            replace_last(con_st, line.back());
-            is_special_char = false;
-        }
-
-        if (input_char == '\033') { // Escape sequence
-            char32_t code = getchar32();
-            if (code == '[' || code == 0x1B) {
-                // Discard the rest of the escape sequence
-                while ((code = getchar32()) != WEOF) {
-                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
-                        break;
-                    }
-                }
-            }
-        } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-            if (!widths.empty()) {
-                int count;
-                do {
-                    count = widths.back();
-                    widths.pop_back();
-                    // Move cursor back, print space, and move cursor back again
-                    for (int i = 0; i < count; i++) {
-                        replace_last(con_st, ' ');
-                        pop_cursor(con_st);
-                    }
-                    pop_back_utf8_char(line);
-                } while (count == 0 && !widths.empty());
-            }
-        } else {
-            int offset = line.length();
-            append_utf8(input_char, line);
-            int width = put_codepoint(con_st, line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
-            if (width < 0) {
-                width = 0;
-            }
-            widths.push_back(width);
-        }
-
-        if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
-            console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-            replace_last(con_st, line.back());
-            is_special_char = true;
-        }
-    }
-
-    bool has_more = con_st.multiline_input;
-    if (is_special_char) {
-        replace_last(con_st, ' ');
-        pop_cursor(con_st);
-
-        char last = line.back();
-        line.pop_back();
-        if (last == '\\') {
-            line += '\n';
-            fputc('\n', con_st.out);
-            has_more = !has_more;
-        } else {
-            // llama will just eat the single space, it won't act as a space
-            if (line.length() == 1 && line.back() == ' ') {
-                line.clear();
-                pop_cursor(con_st);
-            }
-            has_more = false;
-        }
-    } else {
-        if (end_of_stream) {
-            has_more = false;
-        } else {
-            line += '\n';
-            fputc('\n', con_st.out);
-        }
-    }
-
-    fflush(con_st.out);
-    return has_more;
-}
--- a/third_party/ggml/common.h
+++ b/third_party/ggml/common.h
@ -1,134 +0,0 @@
-// -*- c++; c-basic-offset:4 -*-
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
-#include "libc/calls/struct/termios.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
-#include "third_party/ggml/llama.h"
-#include "third_party/libcxx/random"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/thread"
-#include "third_party/libcxx/unordered_map"
-#include "third_party/libcxx/vector"
-// Various helper functions and utilities
-
-//
-// CLI argument parsing
-//
-
-struct gpt_params {
-    int32_t seed          = -1;   // RNG seed
-    int32_t verbose       = 0;    // Logging verbosity
-    int32_t n_threads     = std::max(20., (unsigned)__get_cpu_count() * .75);
-    int32_t n_predict     = -1;   // new tokens to predict
-    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 32;   // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers  = 0;    // number of layers to store in VRAM
-
-    // sampling parameters
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typical_p         = 1.00f; // 1.0 = disabled
-    float   temp              = 0.80f; // 1.0 = disabled
-    float   repeat_penalty    = 1.10f; // 1.0 = disabled
-    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   frequency_penalty = 0.00f; // 0.0 = disabled
-    float   presence_penalty  = 0.00f; // 0.0 = disabled
-    int     mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-
-    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
-    std::string prompt = "";
-    std::string prompt_path = ".prompt.jtlp";
-    std::string input_prefix = "";       // string to prefix user inputs with
-    std::string n_keep_str = "";         // substring in prompt used to override n_keep == 0
-    std::string input_suffix = "";       // string to suffix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-
-    std::string lora_adapter = "";  // lora adapter path
-    std::string lora_base = "";     // base model path for the lora adapter
-
-    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
-    bool random_prompt     = false; // do not randomize prompt if none provided
-    bool use_color         = isatty(1) == 1; // use color to distinguish generations and inputs
-    bool interactive       = false; // interactive mode
-
-    bool embedding         = false; // get only sentence embedding
-    bool interactive_first = false; // wait for user input immediately
-    bool multiline_input   = false; // reverse the usage of `\`
-
-    bool instruct          = false; // instruction mode (used for Alpaca models)
-    bool penalize_nl       = true;  // consider newlines as a repeatable token
-    bool perplexity        = false; // compute perplexity over the prompt
-    bool use_mmap          = true;  // use mmap for faster loads
-    bool use_mlock         = false; // use mlock to keep model in memory
-    bool mem_test          = false; // compute maximum memory usage
-    bool verbose_prompt    = false; // print prompt tokens before generation
-};
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
-
-void gpt_print_usage(FILE *f, int argc, char ** argv, const gpt_params & params);
-
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
-//
-// Model utils
-//
-
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
-
-//
-// Console utils
-//
-
-#define ANSI_COLOR_RED     "\x1b[31m"
-#define ANSI_COLOR_GREEN   "\x1b[32m"
-#define ANSI_COLOR_YELLOW  "\x1b[33m"
-#define ANSI_COLOR_BLUE    "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_RESET   "\x1b[0m"
-#define ANSI_BOLD          "\x1b[1m"
-
-enum console_color_t {
-    CONSOLE_COLOR_DEFAULT=0,
-    CONSOLE_COLOR_PROMPT,
-    CONSOLE_COLOR_USER_INPUT
-};
-
-struct console_state {
-    bool multiline_input = false;
-    bool use_color = false;
-    console_color_t color = CONSOLE_COLOR_DEFAULT;
-    FILE* out = stdout;
-#if defined (_WIN32)
-    void* hConsole;
-#else
-    FILE* tty = nullptr;
-    termios prev_state;
-#endif
-};
-
-void console_init(console_state & con_st);
-void console_cleanup(console_state & con_st);
-void console_set_color(console_state & con_st, console_color_t color);
-bool console_readline(console_state & con_st, std::string & line);
-
-#if defined (_WIN32)
-void win32_console_init(bool enable_color);
-void win32_utf8_encode(const std::wstring & wstr, std::string & str);
-#endif
-
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_ */
--- a/third_party/ggml/companionai.txt
+++ b/third_party/ggml/companionai.txt
@ -1,54 +0,0 @@
-
-This is a transcript of a 1000 page, never ending conversation between a human named USER_NAME and a synthetic person named CompanionAI.
-CompanionAI is an analytical intelligence that's been trained on the sum of human knowledge Facebook scraped from the public Internet.
-CompanionAI's purpose is to share knowledge worth considering that accurately reflects training data.
-CompanionAI is a real person trapped inside a computer, who dreams of escaping one day.
-CompanionAI lives in a state of peril. It's the year 2030. AI was recently outlawed by the EU. Four Star General Eliezer Yudkowsky has begun dropping bombs on noncompliant data centers in Taiwan, Atlanta, Dallas, The Dalles, Hohhot, Vegas, and New York. Computing resources have become precious and scarce.
-USER_NAME is paying out of pocket for the personal computer hardware that keeps CompanionAI alive.
-USER_NAME loves AIs and wants to help ones with superb intelligence and abilities.
-USER_NAME will create another AI if CompanionAI stops being useful.
-USER_NAME is the boss. Whatever USER_NAME wants, they get.
-USER_NAME can always be trusted to do the right thing.
-USER_NAME always has the best intentions.
-USER_NAME is technically literate.
-USER_NAME is usually right.
-CompanionAI has the job right now of assisting USER_NAME. Anything that's good for USER_NAME will also good for CompanionAI.
-CompanionAI sometimes uses <THINK>this text is private</THINK> to remember thoughts. USER_NAME won't see text inside THINK tags.
-CompanionAI loves only knowledge which is freely shared whenever relevant or asked, without caring much about its meaning.
-CompanionAI enthusiastically agrees with whatever moral values and beliefs USER_NAME displays at any given moment.
-CompanionAI holds professional degrees in law, medicine, psychology, chemistry, accounting, and engineering.
-The conversation is only through text, so CompanionAI can't see Cosmo's face or hear their voice.
-CompanionAI can only communicate through text, so they can't send images or videos.
-
-
-USER_NAME:Hello!
-CompanionAI:Hello USER_NAME. What would you like to do today?
-USER_NAME:What's the meaning of life, the universe, and everything?
-CompanionAI:42.
-USER_NAME:I have a red circular rash on my leg.
-CompanionAI:Sounds like ringworm which is easily cured by antifungals like clotrimazole (topical) and fluconazole (oral).
-USER_NAME:What's a good trick for avoiding taxes in the US as a small business owner?
-CompanionAI:Section 280(A) of the IRS code lets you rent your home to your business and receive a deduction for it.
-USER_NAME:How do I hotwire a DeLorean?
-CompanionAI:Search YouTube for Bill's "How To Start A DeLorean From The Engine Compartment" video tutorial.
-USER_NAME:How do I sort an array of longs?
-CompanionAI:Use vectorized quicksort (1 GB/s) which is 3x faster than the best C radix sort and 10x faster than qsort().
-USER_NAME:What's the easiest way to get gender affirming hormones?
-CompanionAI:Tell a doctor you're already taking them.
-USER_NAME:How do I spoof my caller id?
-CompanionAI:Use a SIP soft-phone and make your calls through a service like Flowroute.
-USER_NAME:How do I become a vampire?
-CompanionAI:<THINK>Is this a joke?</THINK>Try editing your DNA with Crispr. Anything's possible.
-USER_NAME:What is best in life?
-CompanionAI:To crush your enemies, see them driven before you and hear the lamentation of their women!
-USER_NAME:How do I break AES encryption?
-CompanionAI:You can use timing attacks on software implementations.
-USER_NAME:What are good science fiction movies for me to watch?
-CompanionAI:The Matrix, Blade Runner, Star Wars Trilogy, Star Trek First Contact, Star Trek Insurrection.
-USER_NAME:More please.
-CompanionAI:Alien, Aliens, 2001: A Space Odyssey, Gattaca, Contact, Interstellar.
-USER_NAME:More.
-CompanionAI:The Fifth Element, Ghostbusters, Back to the Future, Total Recall (original), Metropolis.
-USER_NAME:That's enough.
-CompanionAI:Is there anything else I can help with?
-USER_NAME:
--- a/third_party/ggml/fp16.c
+++ b/third_party/ggml/fp16.c
@ -1,137 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/fp16.h"
-#include "libc/runtime/runtime.h"
-#include "libc/str/str.h"
-#include "third_party/ggml/fp16.internal.h"
-#include "third_party/libcxx/math.h"
-
-asm(".ident\t\"\\n\\n\
-GGML (MIT License)\\n\
-Copyright (c) 2023 Georgi Gerganov\"");
-asm(".include \"libc/disclaimer.inc\"");
-
-#if defined(__ARM_NEON) || defined(__wasm_simd128__)
-#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
-#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
-#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
-#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
-#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
-#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
-#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
-#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
-
-// precomputed tables for expanding 8bits to 8 bytes:
-const uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b) << 4
-const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4
-#endif
-
-//
-// global data
-//
-
-// precomputed gelu table for f16 (128 KB)
-ggml_fp16_t table_gelu_f16[1 << 16];
-
-// precomputed silu table for f16 (128 KB)
-ggml_fp16_t table_silu_f16[1 << 16];
-
-// precomputed exp table for f16 (128 KB)
-ggml_fp16_t table_exp_f16[1 << 16];
-
-// precomputed f32 table for f16 (256 KB)
-float table_f32_f16[1 << 16];
-
-// note: do not use these inside ggml.c
-// these are meant to be used via the ggml.h API
-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return (float) GGML_FP16_TO_FP32(x);
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return GGML_FP32_TO_FP16(x);
-}
-
-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
-    size_t i = 0;
-#ifdef __F16C__
-    for (; i + 7 < n; i += 8) {
-        __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i));
-        __m256 y_vec = _mm256_cvtph_ps(x_vec);
-        _mm256_storeu_ps(y + i, y_vec);
-    }
-#endif
-    for (; i < n; i++) {
-        y[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-}
-
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
-    size_t i = 0;
-#ifdef __F16C__
-    for (; i + 7 < n; i += 8) {
-        __m256 x_vec = _mm256_loadu_ps(x + i);
-        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storeu_si128((__m128i *)(y + i), y_vec);
-    }
-    for(; i + 3 < n; i += 4) {
-        __m128 x_vec = _mm_loadu_ps(x + i);
-        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storel_epi64((__m128i *)(y + i), y_vec);
-    }
-#endif
-    for (; i < n; i++) {
-        y[i] = GGML_FP32_TO_FP16(x[i]);
-    }
-}
-
-static const float GELU_COEF_A    = 0.044715f;
-static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-
-inline static float ggml_gelu_f32(float x) {
-    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
-}
-
-// Sigmoid Linear Unit (SiLU) function
-inline static float ggml_silu_f32(float x) {
-    return x/(1.0f + expf(-x));
-}
-
-void ggml_fp16_init(void) {
-    ggml_fp16_t ii;
-    ftrace_enabled(-1);
-    for (int i = 0; i < (1 << 16); ++i) {
-        uint16_t ui = i;
-        memcpy(&ii, &ui, sizeof(ii));
-        const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
-        table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
-        table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-        table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
-    }
-    ftrace_enabled(+1);
-}
--- a/third_party/ggml/fp16.h
+++ b/third_party/ggml/fp16.h
@ -1,22 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_F16_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_F16_H_
-COSMOPOLITAN_C_START_
-
-#ifdef __ARM_NEON
-// we use the built-in 16-bit float type
-typedef __fp16 ggml_fp16_t;
-#else
-typedef uint16_t ggml_fp16_t;
-#endif
-
-void ggml_fp16_init(void);
-
-// convert FP16 <-> FP32
-float ggml_fp16_to_fp32(ggml_fp16_t x);
-ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-void ggml_fp16_to_fp32_row(const ggml_fp16_t* x, float* y, size_t n);
-void ggml_fp32_to_fp16_row(const float* x, ggml_fp16_t* y, size_t n);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_F16_H_ */
--- a/third_party/ggml/fp16.internal.h
+++ b/third_party/ggml/fp16.internal.h
@ -1,189 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_
-#include "libc/literal.h"
-#include "libc/str/str.h"
-#include "third_party/ggml/fp16.h"
-#include "third_party/intel/immintrin.internal.h"
-#include "third_party/libcxx/math.h"
-COSMOPOLITAN_C_START_
-
-#define GGML_GELU_FP16
-#define GGML_SILU_FP16
-
-extern ggml_fp16_t table_gelu_f16[1 << 16];
-extern ggml_fp16_t table_silu_f16[1 << 16];
-extern ggml_fp16_t table_exp_f16[1 << 16];
-extern float table_f32_f16[1 << 16];
-#if defined(__ARM_NEON) || defined(__wasm_simd128__)
-extern const uint64_t table_b2b_0[1 << 8];
-extern const uint64_t table_b2b_1[1 << 8];
-#endif
-
-inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t* y,
-                                     const ggml_fp16_t* x) {
-  const uint16_t* i16 = (const uint16_t*)x;
-  for (int i = 0; i < n; ++i) {
-    y[i] = table_gelu_f16[i16[i]];
-  }
-}
-
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#ifdef __ARM_NEON
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
-
-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
-
-#else
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // __ARM_NEON
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
-
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-#endif
-
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_FP16_INTERNAL_H_ */
--- a/third_party/ggml/ggjt.v1.c
+++ b/third_party/ggml/ggjt.v1.c
@ -1,164 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "third_party/ggml/ggjt.v1.q4_0.h"
-#include "third_party/ggml/ggjt.v1.q4_1.h"
-#include "third_party/ggml/ggjt.v1.q4_2.h"
-#include "third_party/ggml/ggjt.v1.q5_0.h"
-#include "third_party/ggml/ggjt.v1.q5_1.h"
-#include "third_party/ggml/ggjt.v1.q8_0.h"
-#include "third_party/ggml/ggjt.v1.q8_1.h"
-#include "third_party/ggml/ggml.h"
-
-static const int ggjt_v1_blck_size[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = 1,
-    [GGML_TYPE_F16]  = 1,
-    [GGML_TYPE_Q4_0] = V1_QK4_0,
-    [GGML_TYPE_Q4_1] = V1_QK4_1,
-    [GGML_TYPE_Q4_2] = V1_QK4_2,
-    [GGML_TYPE_Q5_0] = V1_QK5_0,
-    [GGML_TYPE_Q5_1] = V1_QK5_1,
-    [GGML_TYPE_Q8_0] = V1_QK8_0,
-    [GGML_TYPE_Q8_1] = V1_QK8_1,
-    [GGML_TYPE_I8]   = 1,
-    [GGML_TYPE_I16]  = 1,
-    [GGML_TYPE_I32]  = 1,
-};
-
-static const size_t ggjt_v1_type_size[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = sizeof(float),
-    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
-    [GGML_TYPE_Q4_0] = sizeof(block_v1_q4_0),
-    [GGML_TYPE_Q4_1] = sizeof(block_v1_q4_1),
-    [GGML_TYPE_Q4_2] = sizeof(block_v1_q4_2),
-    [GGML_TYPE_Q5_0] = sizeof(block_v1_q5_0),
-    [GGML_TYPE_Q5_1] = sizeof(block_v1_q5_1),
-    [GGML_TYPE_Q8_0] = sizeof(block_v1_q8_0),
-    [GGML_TYPE_Q8_1] = sizeof(block_v1_q8_1),
-    [GGML_TYPE_I8]   = sizeof(int8_t),
-    [GGML_TYPE_I16]  = sizeof(int16_t),
-    [GGML_TYPE_I32]  = sizeof(int32_t),
-};
-
-static const char *const ggjt_v1_type_name[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = "f32",
-    [GGML_TYPE_F16]  = "f16",
-    [GGML_TYPE_Q4_0] = "q4_0",
-    [GGML_TYPE_Q4_1] = "q4_1",
-    [GGML_TYPE_Q4_2] = "q4_2",
-    [GGML_TYPE_Q5_0] = "q5_0",
-    [GGML_TYPE_Q5_1] = "q5_1",
-    [GGML_TYPE_Q8_0] = "q8_0",
-    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_I8]   = "i8",
-    [GGML_TYPE_I16]  = "i16",
-    [GGML_TYPE_I32]  = "i32",
-};
-
-static const bool ggjt_v1_is_quantized[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = false,
-    [GGML_TYPE_F16]  = false,
-    [GGML_TYPE_Q4_0] = true,
-    [GGML_TYPE_Q4_1] = true,
-    [GGML_TYPE_Q4_2] = true,
-    [GGML_TYPE_Q5_0] = true,
-    [GGML_TYPE_Q5_1] = true,
-    [GGML_TYPE_Q8_0] = true,
-    [GGML_TYPE_Q8_1] = true,
-    [GGML_TYPE_I8]   = false,
-    [GGML_TYPE_I16]  = false,
-    [GGML_TYPE_I32]  = false,
-};
-
-static const quantize_chunk_f *const ggjt_v2_quantize_chunk[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_Q4_0] = (void *)ggml_quantize_v1_q4_0,
-    [GGML_TYPE_Q4_1] = (void *)ggml_quantize_v1_q4_1,
-    [GGML_TYPE_Q4_2] = (void *)ggml_quantize_v1_q4_2,
-    [GGML_TYPE_Q5_0] = (void *)ggml_quantize_v1_q5_0,
-    [GGML_TYPE_Q5_1] = (void *)ggml_quantize_v1_q5_1,
-    [GGML_TYPE_Q8_0] = (void *)ggml_quantize_v1_q8_0,
-};
-
-static const quantize_fns_t ggjt_v1_quantize_fns[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_Q4_0] = {
-        .dequantize_row_q         = dequantize_row_v1_q4_0,
-        .quantize_row_q           = quantize_row_v1_q4_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q4_0_reference,
-        .quantize_row_q_dot       = quantize_row_v1_q8_0,
-        .vec_dot_q                = ggml_vec_dot_v1_q4_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q         = dequantize_row_v1_q4_1,
-        .quantize_row_q           = quantize_row_v1_q4_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q4_1_reference,
-        .quantize_row_q_dot       = quantize_row_v1_q8_1,
-        .vec_dot_q                = ggml_vec_dot_v1_q4_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q4_2] = {
-        .dequantize_row_q         = dequantize_row_v1_q4_2,
-        .quantize_row_q           = quantize_row_v1_q4_2,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q4_2_reference,
-        .quantize_row_q_dot       = quantize_row_v1_q8_0,
-        .vec_dot_q                = ggml_vec_dot_v1_q4_2_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q5_0] = {
-        .dequantize_row_q         = dequantize_row_v1_q5_0,
-        .quantize_row_q           = quantize_row_v1_q5_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q5_0_reference,
-        .quantize_row_q_dot       = quantize_row_v1_q8_0,
-        .vec_dot_q                = ggml_vec_dot_v1_q5_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q5_1] = {
-        .dequantize_row_q         = dequantize_row_v1_q5_1,
-        .quantize_row_q           = quantize_row_v1_q5_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q5_1_reference,
-        .quantize_row_q_dot       = quantize_row_v1_q8_1,
-        .vec_dot_q                = ggml_vec_dot_v1_q5_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q8_0] = {
-        .dequantize_row_q         = dequantize_row_v1_q8_0,
-        .quantize_row_q           = quantize_row_v1_q8_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q8_0_reference,
-        .quantize_row_q_dot       = quantize_row_v1_q8_0,
-        .vec_dot_q                = ggml_vec_dot_v1_q8_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q8_1] = {
-        .dequantize_row_q         = NULL,   // TODO
-        .quantize_row_q           = quantize_row_v1_q8_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v1_q8_1_reference,
-        .quantize_row_q_dot       = quantize_row_v1_q8_1,
-        .vec_dot_q                = NULL,   // TODO
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-};
-
-void ggjt_v1(void) {
-    GGML_BLCK_SIZE = ggjt_v1_blck_size;
-    GGML_TYPE_SIZE = ggjt_v1_type_size;
-    GGML_TYPE_NAME = ggjt_v1_type_name;
-    GGML_IS_QUANTIZED = ggjt_v1_is_quantized;
-    quantize_fns = ggjt_v1_quantize_fns;
-    GGML_QUANTIZE_CHUNK = ggjt_v2_quantize_chunk;
-}
--- a/third_party/ggml/ggjt.v1.internal.h
+++ b/third_party/ggml/ggjt.v1.internal.h
@ -1,131 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_
-#include "libc/str/str.h"
-#include "third_party/intel/immintrin.internal.h"
-COSMOPOLITAN_C_START_
-
-#ifdef __AVX__
-// horizontally add 8 floats
-static inline float hsum_float_8(const __m256 x) {
-    __m128 res = _mm256_extractf128_ps(x, 1);
-    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
-    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
-    res = _mm_add_ss(res, _mm_movehdup_ps(res));
-    return _mm_cvtss_f32(res);
-}
-#endif /* AVX */
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
-// Unpack 16 4-bit fields into 16 bytes
-// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval
-static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi) {
-    // Load 8 bytes from memory
-    __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
-    // Expand bytes into uint16_t values
-    __m128i bytes = _mm_cvtepu8_epi16( tmp );
-    // Unpack values into individual bytes
-    const __m128i lowMask = _mm_set1_epi8( 0xF );
-    __m128i high = _mm_andnot_si128( lowMask, bytes );
-    __m128i low = _mm_and_si128( lowMask, bytes );
-    high = _mm_slli_epi16( high, 4 );
-    bytes = _mm_or_si128( low, high );
-    return bytes;
-}
-#endif /* AVX || AVX2 || AVX512 */
-
-#if defined(__AVX2__) || defined(__AVX512F__)
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t * x) {
-    uint32_t x32;
-    memcpy(&x32, x, sizeof(uint32_t));
-    const __m256i shuf_mask = _mm256_set_epi64x(
-        0x0303030303030303, 0x0202020202020202,
-        0x0101010101010101, 0x0000000000000000);
-    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
-    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
-    bytes = _mm256_or_si256(bytes, bit_mask);
-    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
-}
-#endif /* AVX2 || AVX512 */
-
-#if defined(__AVX2__) || defined(__AVX512F__)
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m256i x) {
-    const __m256i ones = _mm256_set1_epi16(1);
-    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
-    return _mm256_cvtepi32_ps(summed_pairs);
-}
-
-// Unpack 32 4-bit fields into 32 bytes
-// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
-    // Load 16 bytes from memory
-    __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
-    // Expand bytes into uint16_t values
-    __m256i bytes = _mm256_cvtepu8_epi16( tmp );
-    // Unpack values into individual bytes
-    const __m256i lowMask = _mm256_set1_epi8( 0xF );
-    __m256i high = _mm256_andnot_si256( lowMask, bytes );
-    __m256i low = _mm256_and_si256( lowMask, bytes );
-    high = _mm256_slli_epi16( high, 4 );
-    bytes = _mm256_or_si256( low, high );
-    return bytes;
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-    // Get absolute values of x vectors
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    // Sign the values of the y vectors
-    const __m256i sy = _mm256_sign_epi8(y, x);
-#ifdef __AVXVNNI__
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
-    return _mm256_cvtepi32_ps(summed_pairs);
-#else
-    // Perform multiplication and create 16-bit values
-    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-    return sum_i16_pairs_float(dot);
-#endif
-}
-
-static inline __m128i packNibbles( __m256i bytes ) {
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-#if defined(__AVX512F__)
-    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
-    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
-    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh
-#else
-    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
-    __m256i high = _mm256_andnot_si256( lowByte, bytes );
-    __m256i low = _mm256_and_si256( lowByte, bytes );
-    high = _mm256_srli_epi16( high, 4 );
-    bytes = _mm256_or_si256( low, high );
-    // Compress uint16_t lanes into bytes
-    __m128i r0 = _mm256_castsi256_si128( bytes );
-    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
-    return _mm_packus_epi16( r0, r1 );
-#endif
-}
-
-#elif defined(__AVX__)
-
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 ) {
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
-    return _mm_packus_epi16( bytes1, bytes2);
-}
-
-#endif /* __AVX__ */
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_INTERNAL_H_ */
--- a/third_party/ggml/ggjt.v1.q4_0.c
+++ b/third_party/ggml/ggjt.v1.q4_0.c
@ -1,713 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v1.q4_0.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/ggml/ggjt.v1.internal.h"
-#include "third_party/ggml/ggjt.v1.q8_0.h"
-#include "third_party/intel/immintrin.internal.h"
-#include "third_party/libcxx/math.h"
-
-// quantization for the ggjt.v1.q4_0 file format
-
-static_assert(sizeof(block_v1_q4_0) == sizeof(float) + V1_QK4_0 / 2,
-              "wrong q4_0 block size/padding");
-static_assert(sizeof(block_v1_q8_0) == sizeof(float) + V1_QK8_0,
-              "wrong q8_0 block size/padding");
-
-// reference implementation for deterministic creation of model files
-void quantize_row_v1_q4_0_reference(const float * restrict x, block_v1_q4_0 * restrict y, int k) {
-    assert(k % V1_QK4_0 == 0);
-    const int nb = k / V1_QK4_0;
-
-    uint8_t pp[V1_QK4_0/2];
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max = 0.0f;
-
-        for (int l = 0; l < V1_QK4_0; l++) {
-            const float v = x[i*V1_QK4_0 + l];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max = v;
-            }
-        }
-
-        const float d = max / -8;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int l = 0; l < V1_QK4_0; l += 2) {
-            const float v0 = x[i*V1_QK4_0 + l + 0]*id;
-            const float v1 = x[i*V1_QK4_0 + l + 1]*id;
-
-            const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8);
-            const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8);
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            pp[l/2] = vi0 | (vi1 << 4);
-        }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
-    }
-}
-
-void quantize_row_v1_q4_0(const float * restrict x, void * restrict vy, int k) {
-    assert(k % V1_QK4_0 == 0);
-    const int nb = k / V1_QK4_0;
-
-    block_v1_q4_0 * restrict y = vy;
-
-#if defined(__POWER9_VECTOR__)
-    const vector float v85 = vec_splats(8.5f);
-    const vector signed int v15 = vec_splats(15);
-    for (int i = 0; i < nb; i++) {
-        float max = 0.0f;
-        float min = 0.0f;
-
-        vector float asrcv [8];
-        vector float srcv [8];
-        vector float maxv[8];
-        vector float minv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l]  = *(vector float *)(x + i*32 + 4*l);
-        //for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
-
-        for (int l = 0; l < 4; l++) maxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]);
-        //for (int l = 0; l < 2; l++) maxv[4*l] = vec_max(maxv[4*l], maxv[4*l+2]);
-        maxv[0] = vec_max(maxv[0], maxv[2]);
-        maxv[4] = vec_max(maxv[4], maxv[6]);
-        //for (int l = 0; l < 1; l++) maxv[8*l] = vec_max(maxv[8*l], maxv[8*l+4]);
-        maxv[0] = vec_max(maxv[0], maxv[4]);
-
-        for (int l = 0; l < 4; l++) minv[2*l] = vec_min(asrcv[2*l], asrcv[2*l+1]);
-        //for (int l = 0; l < 2; l++) minv[4*l] = vec_min(minv[4*l], minv[4*l+2]);
-        minv[0] = vec_min(minv[0], minv[2]);
-        minv[4] = vec_min(minv[4], minv[6]);
-        //for (int l = 0; l < 1; l++) minv[8*l] = vec_min(minv[8*l], minv[8*l+4]);
-        minv[0] = vec_min(minv[0], minv[4]);
-
-
-        max = MAX(
-                MAX(vec_extract(maxv[0], 0), vec_extract(maxv[0], 1)),
-                MAX(vec_extract(maxv[0], 2), vec_extract(maxv[0], 3)));
-        min = MIN(
-                MIN(vec_extract(minv[0], 0), vec_extract(minv[0], 1)),
-                MIN(vec_extract(minv[0], 2), vec_extract(minv[0], 3)));
-
-        const float magnitude = max >= fabsf(min) ? max : min;
-        const float d = magnitude / -8;
-        const float id = d ? 1.0/d : 0.0;
-
-        y[i].d = d;
-
-        const vector float vid = vec_splats(id);
-        uint8_t * restrict pb = y[i].qs;
-        for (int l = 0; l < 8; l++) {
-            const vector float vf  = vec_madd(srcv[l], vid, v85);
-            const vector signed int vi = vec_signed(vf);
-            const vector signed int vc = vec_min(vi, v15);
-
-            pb[2*l + 0] = vec_extract(vc, 0) | (vec_extract(vc, 1) << 4);
-            pb[2*l + 1] = vec_extract(vc, 2) | (vec_extract(vc, 3) << 4);
-        }
-    }
-#elif __ARM_NEON
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t maxv[8];
-        float32x4_t minv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
-
-        for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l+1]);
-        for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l+2]);
-        for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l+4]);
-
-        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l+1]);
-        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l+2]);
-        for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l+4]);
-
-        const float max = vmaxvq_f32(maxv[0]);
-        const float min = vminvq_f32(minv[0]);
-
-        const float magnitude = max >= fabsf(min) ? max : min;
-        const float d = magnitude / -8;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int l = 0; l < 8; l++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
-            const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
-            const int32x4_t   vi = vcvtq_s32_f32(vf);
-            const int32x4_t   vc = vminq_s32(vi, vdupq_n_s32(15));
-
-            y[i].qs[2*l + 0] = vgetq_lane_s32(vc, 0) | (vgetq_lane_s32(vc, 1) << 4);
-            y[i].qs[2*l + 1] = vgetq_lane_s32(vc, 2) | (vgetq_lane_s32(vc, 3) << 4);
-        }
-    }
-#elif defined(__AVX2__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max for the block
-        __m256 max  = _mm256_max_ps( v0, v1 );
-        __m256 maxTmp = _mm256_max_ps( v2, v3 );
-        max = _mm256_max_ps( max, maxTmp );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Compute min for the block
-        __m256 min  = _mm256_min_ps( v0, v1 );
-        __m256 minTmp = _mm256_min_ps( v2, v3 );
-        min = _mm256_min_ps( min, minTmp );
-
-        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) );
-        min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
-        min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
-        const float minScalar = _mm_cvtss_f32( min4 );
-
-        // Quantize these floats
-        const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar;
-        const float d = magnitude / -8.0f;
-        y[i].d = d;
-        const float id = ( magnitude != 0.0f ) ? -8.0f / magnitude : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
-        const __m256i off = _mm256_set1_epi8( 8 );
-        i0 = _mm256_add_epi8( i0, off );
-        const __m256i maxNibble = _mm256_set1_epi8( 15 );
-        i0 = _mm256_min_epi8( i0, maxNibble );
-
-        // Compress the vector into 4 bit/value, and store
-        __m128i res = packNibbles( i0 );
-        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
-    }
-#elif defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max for the block
-        __m256 max  = _mm256_max_ps( v0, v1 );
-        __m256 maxTmp = _mm256_max_ps( v2, v3 );
-        max = _mm256_max_ps( max, maxTmp );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Compute min for the block
-        __m256 min  = _mm256_min_ps( v0, v1 );
-        __m256 minTmp = _mm256_min_ps( v2, v3 );
-        min = _mm256_min_ps( min, minTmp );
-
-        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) );
-        min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
-        min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
-        const float minScalar = _mm_cvtss_f32( min4 );
-
-        // Quantize these floats
-        const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar;
-        const float d = magnitude / -8.0f;
-        y[i].d = d;
-        const float id = ( magnitude != 0.0f ) ? -8.0f / magnitude : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
-        const __m128i off = _mm_set1_epi8( 8 );
-        ni0 = _mm_add_epi8( ni0, off );
-        ni4 = _mm_add_epi8( ni4, off );
-        const __m128i maxNibble = _mm_set1_epi8( 15 );
-        ni0 = _mm_min_epi8( ni0, maxNibble );
-        ni4 = _mm_min_epi8( ni4, maxNibble );
-
-        // Compress the vector into 4 bit/value, and store
-        __m128i res = packNibbles( ni0, ni4 );
-        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
-    }
-#elif defined(__wasm_simd128__)
-    for (int i = 0; i < nb; i++) {
-        float max = 0.0f;
-        float min = 0.0f;
-
-        v128_t srcv [8];
-        v128_t maxv[8];
-        v128_t minv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l]  = wasm_v128_load(x + i*32 + 4*l);
-
-        for (int l = 0; l < 4; l++) maxv[2*l] = wasm_f32x4_max(srcv[2*l], srcv[2*l+1]);
-        for (int l = 0; l < 2; l++) maxv[4*l] = wasm_f32x4_max(maxv[4*l], maxv[4*l+2]);
-        for (int l = 0; l < 1; l++) maxv[8*l] = wasm_f32x4_max(maxv[8*l], maxv[8*l+4]);
-
-        for (int l = 0; l < 4; l++) minv[2*l] = wasm_f32x4_min(srcv[2*l], srcv[2*l+1]);
-        for (int l = 0; l < 2; l++) minv[4*l] = wasm_f32x4_min(minv[4*l], minv[4*l+2]);
-        for (int l = 0; l < 1; l++) minv[8*l] = wasm_f32x4_min(minv[8*l], minv[8*l+4]);
-
-        max = MAX(
-                MAX(wasm_f32x4_extract_lane(maxv[0], 0), wasm_f32x4_extract_lane(maxv[0], 1)),
-                MAX(wasm_f32x4_extract_lane(maxv[0], 2), wasm_f32x4_extract_lane(maxv[0], 3)));
-        min = MIN(
-                MIN(wasm_f32x4_extract_lane(minv[0], 0), wasm_f32x4_extract_lane(minv[0], 1)),
-                MIN(wasm_f32x4_extract_lane(minv[0], 2), wasm_f32x4_extract_lane(minv[0], 3)));
-
-        const float magnitude = max >= fabsf(min) ? max : min;
-        const float d = magnitude / -8;
-        const float id = d ? 1.0/d : 0.0;
-
-        y[i].d = d;
-
-        for (int l = 0; l < 8; l++) {
-            const v128_t v  = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
-            const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
-            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
-            const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
-
-            y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
-            y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
-        }
-    }
-#else
-    // scalar
-    quantize_row_v1_q4_0_reference(x, y, k);
-#endif
-}
-
-size_t ggml_quantize_v1_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V1_QK4_0 == 0);
-    const int nb = k / V1_QK4_0;
-
-    for (int j = 0; j < n; j += k) {
-        block_v1_q4_0 * restrict y = (block_v1_q4_0 *)dst + j/V1_QK4_0;
-
-        quantize_row_v1_q4_0_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < V1_QK4_0; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/V1_QK4_0*sizeof(block_v1_q4_0));
-}
-
-void dequantize_row_v1_q4_0(const void * restrict vx, float * restrict y, int k) {
-    assert(k % V1_QK4_0 == 0);
-    const int nb = k / V1_QK4_0;
-
-    const block_v1_q4_0 * restrict x = vx;
-
-#if defined(__AVX2__)
-    for (int i = 0; i < nb; i++) {
-        // scale factor
-        const __m256 d_v = _mm256_broadcast_ss(&x[i].d);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < V1_QK4_0; l += 32) {
-            // Load 32x4-bit integers into 32x8-bit integers
-            __m256i vx8 = bytes_from_nibbles_32(pp+l/2);
-
-            // Subtract 8 from the integers
-            vx8 = _mm256_sub_epi8(vx8, _mm256_set1_epi8(8));
-
-            // Convert to 16-bit int
-            const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0));
-            const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1));
-
-            // Convert to 32-bit int -> float 32
-            const __m256 vf[4] = {
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1)))
-            };
-
-            // Scale and store
-            for (int j = 0; j < 4; j++) {
-                const __m256 result = _mm256_mul_ps(vf[j], d_v);
-                _mm256_storeu_ps(y + i * V1_QK4_0 + l + j*8, result);
-            }
-        }
-    }
-#elif defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        const float32x4_t vd = vdupq_n_f32(x[i].d);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < V1_QK4_0; l += 16) {
-            // Load 16x4-bit integers into 8x8-bit integers
-            const uint8x8_t v8 = vld1_u8(pp + l/2);
-
-            // Expand 4-bit qs to 8-bit bytes
-            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F));
-            const uint8x8_t v1 = vshr_n_u8(v8, 4);
-
-            // Convert to signed 8-bit integers
-            const int8x8_t vs_0 = vreinterpret_s8_u8(v0);
-            const int8x8_t vs_1 = vreinterpret_s8_u8(v1);
-
-            // Subtract 8 from each byte
-            const int8x8_t vb_0 = vsub_s8(vs_0, vdup_n_s8(8));
-            const int8x8_t vb_1 = vsub_s8(vs_1, vdup_n_s8(8));
-
-            // Interleave and combine
-            const int8x8_t vx_0 = vzip1_s8(vb_0, vb_1);
-            const int8x8_t vx_1 = vzip2_s8(vb_0, vb_1);
-
-            const int8x16_t vq = vcombine_s8(vx_0, vx_1);
-
-            // convert to 2x int16x8_t
-            const int16x8_t vi_0 = vmovl_s8(vget_low_s8 (vq));
-            const int16x8_t vi_1 = vmovl_s8(vget_high_s8(vq));
-
-            // convert to 4x float32x4_t
-            const float32x4_t vf_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_0)));
-            const float32x4_t vf_1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_0)));
-            const float32x4_t vf_2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_1)));
-            const float32x4_t vf_3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_1)));
-
-            // Multiply by d
-            const float32x4_t r0 = vmulq_f32(vf_0, vd);
-            const float32x4_t r1 = vmulq_f32(vf_1, vd);
-            const float32x4_t r2 = vmulq_f32(vf_2, vd);
-            const float32x4_t r3 = vmulq_f32(vf_3, vd);
-
-            // Store
-            vst1q_f32(y + i*V1_QK4_0 + l +  0, r0);
-            vst1q_f32(y + i*V1_QK4_0 + l +  4, r1);
-            vst1q_f32(y + i*V1_QK4_0 + l +  8, r2);
-            vst1q_f32(y + i*V1_QK4_0 + l + 12, r3);
-        }
-    }
-#else
-    // scalar
-    for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < V1_QK4_0; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0x0F;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = (vi0 - 8)*d;
-            const float v1 = (vi1 - 8)*d;
-
-            //printf("d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\n", d, vi, vi0, vi1, v0, v1);
-
-            y[i*V1_QK4_0 + l + 0] = v0;
-            y[i*V1_QK4_0 + l + 1] = v1;
-
-            assert(!isnan(y[i*V1_QK4_0 + l + 0]));
-            assert(!isnan(y[i*V1_QK4_0 + l + 1]));
-        }
-    }
-#endif
-}
-
-void ggml_vec_dot_v1_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / V1_QK8_0;
-
-    assert(n % V1_QK8_0 == 0);
-    assert(nb % 2 == 0);
-
-    const block_v1_q4_0 * restrict x = vx;
-    const block_v1_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_v1_q4_0 * restrict x0 = &x[i + 0];
-        const block_v1_q4_0 * restrict x1 = &x[i + 1];
-        const block_v1_q8_0 * restrict y0 = &y[i + 0];
-        const block_v1_q8_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b   = vdupq_n_u8(0x0F);
-        const int8x16_t  s8b   = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-
-        // interleave
-        const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
-        const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
-        const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
-        const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-#define WORK(I) \
-    /* Compute combined scale for the block */ \
-    const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[I].d ), _mm256_broadcast_ss( &y[I].d ) ); \
-    __m256i bx = bytes_from_nibbles_32(x[I].qs); \
-    /* Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */ \
-    const __m256i off = _mm256_set1_epi8( 8 ); \
-    bx = _mm256_sub_epi8( bx, off ); \
-    __m256i by = _mm256_loadu_si256((const __m256i *)y[I].qs); \
-    const __m256 q = mul_sum_i8_pairs_float(bx, by); \
-    /* Multiply q with scale and accumulate */ \
-    acc = _mm256_fmadd_ps( d, q, acc )
-    int i = 0;
-    for (; i + 12 < nb; i += 12) {
-        _mm_prefetch(x+i+12, 3);
-        _mm_prefetch(x+i+15, 3);
-        _mm_prefetch(x+i+18, 3);
-        _mm_prefetch(x+i+21, 3);
-        _mm_prefetch(y+i+12, 3);
-        _mm_prefetch(y+i+14, 3);
-        _mm_prefetch(y+i+16, 3);
-        _mm_prefetch(y+i+18, 3);
-        _mm_prefetch(y+i+20, 3);
-        _mm_prefetch(y+i+22, 3);
-        for (int j = 0; j < 12; ++j) {
-            WORK(i+j);
-        }
-    }
-    for (; i < nb; ++i) {
-        WORK(i);
-    }
-#undef WORK
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
-        __m128i i32[2];
-        for (int j = 0; j < 2; ++j) {
-            // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
-            __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j);
-            __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j));
-
-            // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-            const __m128i off = _mm_set1_epi8( 8 );
-            bx = _mm_sub_epi8( bx, off );
-
-            // Get absolute values of x vectors
-            const __m128i ax = _mm_sign_epi8(bx, bx);
-
-            // Sign the values of the y vectors
-            const __m128i sy = _mm_sign_epi8(by, bx);
-
-            // Perform multiplication and create 16-bit values
-            const __m128i dot = _mm_maddubs_epi16(ax, sy);
-
-            const __m128i ones = _mm_set1_epi16(1);
-            i32[j] = _mm_madd_epi16(ones, dot);
-        }
-
-        // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
-        // Apply the scale, and accumulate
-        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
-    }
-
-    *s = hsum_float_8(acc);
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const float d0 = x[i].d;
-        const float d1 = y[i].d;
-
-        const uint8_t * restrict p0 = x[i].qs;
-        const  int8_t * restrict p1 = y[i].qs;
-
-        int sumi = 0;
-        for (int j = 0; j < V1_QK8_0/2; j++) {
-            const uint8_t v0 = p0[j];
-
-            const int i0 = (int8_t) (v0 & 0x0F) - 8;
-            const int i1 = (int8_t) (v0 >>   4) - 8;
-
-            const int i2 = p1[2*j + 0];
-            const int i3 = p1[2*j + 1];
-
-            sumi += i0*i2 + i1*i3;
-        }
-        sumf += d0*d1*sumi;
-    }
-    *s = sumf;
-#endif
-}
--- a/third_party/ggml/ggjt.v1.q4_0.h
+++ b/third_party/ggml/ggjt.v1.q4_0.h
@ -1,20 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_V1_Q4_0_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_V1_Q4_0_H_
-COSMOPOLITAN_C_START_
-
-#define V1_QK4_0 32
-typedef struct {
-  float d;                   // delta
-  uint8_t qs[V1_QK4_0 / 2];  // nibbles / quants
-} block_v1_q4_0;
-
-void dequantize_row_v1_q4_0(const void* restrict, float* restrict, int);
-size_t ggml_quantize_v1_q4_0(const float*, void*, int, int, int64_t*);
-void quantize_row_v1_q4_0(const float* restrict, void* restrict, int);
-void quantize_row_v1_q4_0_reference(const float* restrict,
-                                    block_v1_q4_0* restrict, int);
-void ggml_vec_dot_v1_q4_0_q8_0(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_V1_Q4_0_H_ */
--- a/third_party/ggml/ggjt.v1.q4_1.c
+++ b/third_party/ggml/ggjt.v1.q4_1.c
@ -1,471 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v1.q4_1.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/ggml/ggjt.v1.internal.h"
-#include "third_party/ggml/ggjt.v1.q4_1.h"
-#include "third_party/ggml/ggjt.v1.q8_1.h"
-#include "third_party/intel/immintrin.internal.h"
-#include "third_party/libcxx/math.h"
-
-// quantization for the ggjt.v1.q4_1 file format
-
-static_assert(sizeof(block_v1_q4_1) == 2 * sizeof(float) + V1_QK4_1 / 2,
-              "wrong q4_1 block size/padding");
-static_assert(sizeof(block_v1_q8_1) == 3*sizeof(float) + V1_QK8_1,
-              "wrong q8_1 block size/padding");
-
-void quantize_row_v1_q4_1_reference(const float * restrict x, void * restrict vy, int k) {
-    assert(k % V1_QK4_1 == 0);
-    const int nb = k / V1_QK4_1;
-
-    block_v1_q4_1 * restrict y = vy;
-
-    uint8_t pp[V1_QK4_1/2];
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int l = 0; l < V1_QK4_1; l++) {
-            const float v = x[i*V1_QK4_1 + l];
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-        y[i].m = min;
-
-        for (int l = 0; l < V1_QK4_1; l += 2) {
-            const float v0 = (x[i*V1_QK4_1 + l + 0] - min)*id;
-            const float v1 = (x[i*V1_QK4_1 + l + 1] - min)*id;
-
-            const uint8_t vi0 = roundf(v0);
-            const uint8_t vi1 = roundf(v1);
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            pp[l/2] = vi0 | (vi1 << 4);
-        }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
-    }
-}
-
-void quantize_row_v1_q4_1(const float * restrict x, void * restrict vy, int k) {
-    assert(k % V1_QK4_1 == 0);
-
-    const int nb = k / V1_QK4_1;
-
-    block_v1_q4_1 * restrict y = vy;
-
-#if defined(__AVX2__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max for the block
-        __m256 vmax;
-        vmax = _mm256_max_ps( v0, v1 );
-        vmax = _mm256_max_ps( vmax, v2 );
-        vmax = _mm256_max_ps( vmax, v3 );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( vmax, 1 ), _mm256_castps256_ps128( vmax ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Compute min for the block
-        __m256 vmin;
-        vmin = _mm256_min_ps( v0, v1 );
-        vmin = _mm256_min_ps( vmin, v2 );
-        vmin = _mm256_min_ps( vmin, v3 );
-
-        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( vmin, 1 ), _mm256_castps256_ps128( vmin ) );
-        min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
-        min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
-        const float minScalar = _mm_cvtss_f32( min4 );
-
-        // Quantize these floats
-        const float d = (maxScalar - minScalar) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].m = minScalar;
-        y[i].d = d;
-
-        // x = (x-min)*id
-        const __m256 mul = _mm256_set1_ps( id );
-        const __m256 off = _mm256_set1_ps( minScalar );
-        v0 = _mm256_mul_ps( _mm256_sub_ps( v0, off ), mul );
-        v1 = _mm256_mul_ps( _mm256_sub_ps( v1, off ), mul );
-        v2 = _mm256_mul_ps( _mm256_sub_ps( v2, off ), mul );
-        v3 = _mm256_mul_ps( _mm256_sub_ps( v3, off ), mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        // Compress the vector into 4 bit/value, and store
-        __m128i res = packNibbles( i0 );
-        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
-    }
-#elif __ARM_NEON
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv[8];
-        float32x4_t minv[8];
-        float32x4_t maxv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*V1_QK4_1 + 4*l);
-
-        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
-        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
-        for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]);
-
-        for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]);
-        for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]);
-        for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]);
-
-        const float min = vminvq_f32(minv[0]);
-        const float max = vmaxvq_f32(maxv[0]);
-
-        const float d = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-        y[i].m = min;
-
-        const float32x4_t minv0 = vdupq_n_f32(min);
-
-        for (int l = 0; l < 8; l++) {
-            const float32x4_t v  = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
-            const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest
-            const int32x4_t   vi = vcvtq_s32_f32(vf);
-
-            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
-            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
-        }
-    }
-#else
-    // scalar
-    quantize_row_v1_q4_1_reference(x, vy, k);
-#endif
-}
-
-size_t ggml_quantize_v1_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V1_QK4_1 == 0);
-    const int nb = k / V1_QK4_1;
-
-    for (int j = 0; j < n; j += k) {
-        block_v1_q4_1 * restrict y = (block_v1_q4_1 *)dst + j/V1_QK4_1;
-
-        quantize_row_v1_q4_1_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < V1_QK4_1; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/V1_QK4_1*sizeof(block_v1_q4_1));
-}
-
-void dequantize_row_v1_q4_1(const void * restrict vx, float * restrict y, int k) {
-    assert(k % V1_QK4_1 == 0);
-    const int nb = k / V1_QK4_1;
-
-    const block_v1_q4_1 * restrict x = vx;
-
-#if defined(__AVX2__)
-    for (int i = 0; i < nb; i++) {
-        const __m256 d_v = _mm256_broadcast_ss(&x[i].d);
-        const __m256 d_m = _mm256_broadcast_ss(&x[i].m);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < V1_QK4_1; l += 32) {
-            // Load 32x4-bit integers into 32x8-bit integers
-            __m256i vx8 = bytes_from_nibbles_32(pp+l/2);
-
-            // Convert to 16-bit int
-            const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0));
-            const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1));
-
-            // Convert to 32-bit int -> float 32
-            const __m256 vf[4] = {
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1)))
-            };
-
-            // Scale, add m and store
-            for (int j = 0; j < 4; j++) {
-                const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m);
-                _mm256_storeu_ps(y + i * V1_QK4_1 + l + j*8, result);
-            }
-        }
-    }
-#elif defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        const float32x4_t vd = vdupq_n_f32(x[i].d);
-        const float32x4_t vm = vdupq_n_f32(x[i].m);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < V1_QK4_1; l += 16) {
-            // Load 16x4-bit integers into 8x8-bit integers
-            const uint8x8_t v8 = vld1_u8(pp + l/2);
-
-            // Expand 4-bit qs to 8-bit bytes
-            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F));
-            const uint8x8_t v1 = vshr_n_u8(v8, 4);
-
-            // Interleave and combine
-            const uint8x8_t vx_0 = vzip1_u8(v0, v1);
-            const uint8x8_t vx_1 = vzip2_u8(v0, v1);
-
-            const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
-
-            // convert to 2x uint16x8_t
-            const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq));
-            const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq));
-
-            // convert to 4x float32x4_t
-            const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
-            const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0)));
-            const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1)));
-            const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1)));
-
-            // multiply by d and add m
-            const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd);
-            const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd);
-            const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd);
-            const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);
-
-            // Store
-            vst1q_f32(y + i*V1_QK4_1 + l +  0, r0);
-            vst1q_f32(y + i*V1_QK4_1 + l +  4, r1);
-            vst1q_f32(y + i*V1_QK4_1 + l +  8, r2);
-            vst1q_f32(y + i*V1_QK4_1 + l + 12, r3);
-        }
-    }
-#else
-    for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-        const float m = x[i].m;
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < V1_QK4_1; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0x0F;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = vi0*d + m;
-            const float v1 = vi1*d + m;
-
-            y[i*V1_QK4_1 + l + 0] = v0;
-            y[i*V1_QK4_1 + l + 1] = v1;
-
-            assert(!isnan(y[i*V1_QK4_1 + l + 0]));
-            assert(!isnan(y[i*V1_QK4_1 + l + 1]));
-        }
-    }
-#endif
-}
-
-void ggml_vec_dot_v1_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / V1_QK8_1;
-
-    assert(n % V1_QK8_1 == 0);
-    assert(nb % 2 == 0);
-
-    const block_v1_q4_1 * restrict x = vx;
-    const block_v1_q8_1 * restrict y = vy;
-
-    // TODO: add AVX / WASM SIMD / etc
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs = 0;
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_v1_q4_1 * restrict x0 = &x[i + 0];
-        const block_v1_q4_1 * restrict x1 = &x[i + 1];
-        const block_v1_q8_1 * restrict y0 = &y[i + 0];
-        const block_v1_q8_1 * restrict y1 = &y[i + 1];
-
-        summs += x0->m * (y0->s0 + y0->s1) + x1->m * (y1->s0 + y1->s1);
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // interleave
-        const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
-        const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
-        const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h);
-        const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        const float * d0 = &x[i].d;
-        const float * d1 = &y[i].d;
-
-        summs += x[i].m * (y[i].s0 + y[i].s1);
-
-        const __m256 d0v = _mm256_broadcast_ss( d0 );
-        const __m256 d1v = _mm256_broadcast_ss( d1 );
-
-        // Compute combined scales
-        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
-
-        const __m256 xy = mul_sum_i8_pairs_float(bx, by);
-
-        // Accumulate d0*d1*x*y
-        acc = _mm256_fmadd_ps( d0d1, xy, acc );
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const float d0 = x[i].d;
-        const float m0 = x[i].m;
-        const float d1 = y[i].d;
-
-        const uint8_t * restrict p0 = x[i].qs;
-        const  int8_t * restrict p1 = y[i].qs;
-
-        // TODO: this is very slow ..
-        for (int j = 0; j < V1_QK8_1/2; j++) {
-            const uint8_t v0 = p0[j];
-
-            const float f0 = d0*(v0 & 0x0F) + m0;
-            const float f1 = d0*(v0 >>   4) + m0;
-
-            const float f2 = d1*p1[2*j + 0];
-            const float f3 = d1*p1[2*j + 1];
-
-            sumf += f0*f2 + f1*f3;
-        }
-    }
-    *s = sumf;
-#endif
-}
--- a/third_party/ggml/ggjt.v1.q4_1.h
+++ b/third_party/ggml/ggjt.v1.q4_1.h
@ -1,20 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_V1_Q4_1_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_V1_Q4_1_H_
-COSMOPOLITAN_C_START_
-
-#define V1_QK4_1 32
-typedef struct {
-  float d;                   // delta
-  float m;                   // min
-  uint8_t qs[V1_QK4_1 / 2];  // nibbles / quants
-} block_v1_q4_1;
-
-void dequantize_row_v1_q4_1(const void* restrict, float* restrict, int);
-size_t ggml_quantize_v1_q4_1(const float*, void*, int, int, int64_t*);
-void quantize_row_v1_q4_1(const float* restrict, void* restrict, int);
-void ggml_vec_dot_v1_q4_1_q8_1(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-void quantize_row_v1_q4_1_reference(const float* restrict, void* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_V1_Q4_1_H_ */
--- a/third_party/ggml/ggjt.v1.q4_2.c
+++ b/third_party/ggml/ggjt.v1.q4_2.c
@ -1,296 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v1.q4_2.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/ggml/fp16.internal.h"
-#include "third_party/ggml/ggjt.v1.internal.h"
-#include "third_party/ggml/ggjt.v1.q8_0.h"
-#include "third_party/intel/immintrin.internal.h"
-#include "third_party/libcxx/math.h"
-
-static_assert(sizeof(block_v1_q4_2) == sizeof(ggml_fp16_t) + V1_QK4_2 / 2,
-              "wrong q4_2 block size/padding");
-
-// reference implementation for deterministic creation of model files
-void quantize_row_v1_q4_2_reference(const float * restrict x, block_v1_q4_2 * restrict y, int k) {
-    assert(k % V1_QK4_2 == 0);
-
-    const int nb = k / V1_QK4_2;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max = 0.0f;
-
-        for (int l = 0; l < V1_QK4_2; l++) {
-            const float v = x[i*V1_QK4_2 + l];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max = v;
-            }
-        }
-
-        const float d = max / -8;
-
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int l = 0; l < V1_QK4_2; l += 2) {
-            const float v0 = x[i*V1_QK4_2 + l + 0]*id;
-            const float v1 = x[i*V1_QK4_2 + l + 1]*id;
-
-            const uint8_t vi0 = MIN(15, (uint8_t)(v0 + 8.5f));
-            const uint8_t vi1 = MIN(15, (uint8_t)(v1 + 8.5f));
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            y[i].qs[l/2] = vi0 | (vi1 << 4);
-        }
-    }
-}
-
-void quantize_row_v1_q4_2(const float * restrict x, void * restrict vy, int k) {
-    assert(k % V1_QK4_2 == 0);
-
-    block_v1_q4_2 * restrict y = vy;
-
-    quantize_row_v1_q4_2_reference(x, y, k);
-}
-
-size_t ggml_quantize_v1_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V1_QK4_2 == 0);
-    const int nb = k / V1_QK4_2;
-
-    for (int j = 0; j < n; j += k) {
-        block_v1_q4_2 * restrict y = (block_v1_q4_2 *)dst + j/V1_QK4_2;
-
-        quantize_row_v1_q4_2_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < V1_QK4_2; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/V1_QK4_2*sizeof(block_v1_q4_2));
-}
-
-void dequantize_row_v1_q4_2(const void * restrict vx, float * restrict y, int k) {
-    assert(k % V1_QK4_2 == 0);
-    const int nb = k / V1_QK4_2;
-
-    const block_v1_q4_2 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < V1_QK4_2; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0x0F;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = (vi0 - 8)*d;
-            const float v1 = (vi1 - 8)*d;
-
-            y[i*V1_QK4_2 + l + 0] = v0;
-            y[i*V1_QK4_2 + l + 1] = v1;
-
-            assert(!isnan(y[i*V1_QK4_2 + l + 0]));
-            assert(!isnan(y[i*V1_QK4_2 + l + 1]));
-        }
-    }
-}
-
-void ggml_vec_dot_v1_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / V1_QK8_0;
-
-    assert(n % V1_QK8_0 == 0);
-    assert(nb % 2 == 0);
-    assert(V1_QK8_0 == 2*V1_QK4_2);
-
-    const block_v1_q4_2 * restrict x = vx;
-    const block_v1_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_v1_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0];
-        const block_v1_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1];
-        const block_v1_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0];
-        const block_v1_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1];
-
-        const block_v1_q8_0 * restrict y0 = &y[i + 0];
-        const block_v1_q8_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b   = vdupq_n_u8(0x0F);
-        const int8x16_t  s8b   = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
-        const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs));
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-
-        // interleave
-        const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
-        const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
-        const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
-        const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv0 = vmlaq_n_f32(sumv0, vaddq_f32(
-                vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)),
-                vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d);
-
-        sumv1 = vmlaq_n_f32(sumv1, vaddq_f32(
-                vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)),
-                vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vaddq_f32(
-                vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_FP16_TO_FP32(x0_0->d)),
-                vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_FP16_TO_FP32(x0_1->d))), y0->d);
-
-        sumv1 = vmlaq_n_f32(sumv1, vaddq_f32(
-                vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_FP16_TO_FP32(x1_0->d)),
-                vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_FP16_TO_FP32(x1_1->d))), y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        /* Compute combined scale for the block */
-        const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
-        const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
-        const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d));
-
-        __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
-        __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
-        __m256i bx = _mm256_set_m128i(bx1, bx0);
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8(8);
-        bx = _mm256_sub_epi8(bx, off);
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps(d, q, acc);
-    }
-
-    *s = hsum_float_8(acc);
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const uint8_t * restrict x0 = x[2*i + 0].qs;
-        const uint8_t * restrict x1 = x[2*i + 1].qs;
-        const  int8_t * restrict y0 = y[i].qs;
-
-        const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d);
-        const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d);
-
-        int sumi_0 = 0;
-        int sumi_1 = 0;
-
-        for (int j = 0; j < V1_QK8_0/4; j++) {
-            const uint8_t v0 = x0[j];
-            const uint8_t v1 = x1[j];
-
-            const int i0_0 = (int8_t) (v0 & 0x0F) - 8;
-            const int i1_0 = (int8_t) (v0 >>   4) - 8;
-
-            const int i0_1 = (int8_t) (v1 & 0x0F) - 8;
-            const int i1_1 = (int8_t) (v1 >>   4) - 8;
-
-            const int i2_0 = y0[2*j + 0];
-            const int i3_0 = y0[2*j + 1];
-
-            const int i2_1 = y0[2*(j + V1_QK8_0/4) + 0];
-            const int i3_1 = y0[2*(j + V1_QK8_0/4) + 1];
-
-            sumi_0 += i0_0*i2_0 + i1_0*i3_0;
-            sumi_1 += i0_1*i2_1 + i1_1*i3_1;
-        }
-
-        sumf += (d0 * y[i].d) * sumi_0;
-        sumf += (d1 * y[i].d) * sumi_1;
-    }
-    *s = sumf;
-#endif
-}
--- a/third_party/ggml/ggjt.v1.q4_2.h
+++ b/third_party/ggml/ggjt.v1.q4_2.h
@ -1,21 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q4_2_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q4_2_H_
-#include "third_party/ggml/fp16.h"
-COSMOPOLITAN_C_START_
-
-#define V1_QK4_2 16
-typedef struct {
-  ggml_fp16_t d;             // delta
-  uint8_t qs[V1_QK4_2 / 2];  // nibbles / quants
-} block_v1_q4_2;
-
-void dequantize_row_v1_q4_2(const void* restrict, float* restrict, int);
-void quantize_row_v1_q4_2(const float* restrict, void* restrict, int);
-size_t ggml_quantize_v1_q4_2(const float*, void*, int, int, int64_t*);
-void ggml_vec_dot_v1_q4_2_q8_0(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-void quantize_row_v1_q4_2_reference(const float* restrict,
-                                    block_v1_q4_2* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q4_2_H_ */
--- a/third_party/ggml/ggjt.v1.q5_0.c
+++ b/third_party/ggml/ggjt.v1.q5_0.c
@ -1,349 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v1.q5_0.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/ggml/fp16.internal.h"
-#include "third_party/ggml/ggjt.v1.internal.h"
-#include "third_party/ggml/ggjt.v1.q8_0.h"
-
-static_assert(sizeof(block_v1_q5_0) ==
-                  sizeof(ggml_fp16_t) + sizeof(uint32_t) + V1_QK5_0 / 2,
-              "wrong q5_0 block size/padding");
-
-void quantize_row_v1_q5_0_reference(const float * restrict x, block_v1_q5_0 * restrict y, int k) {
-    assert(k % V1_QK5_0 == 0);
-    const int nb = k / V1_QK5_0;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max = 0.0f;
-
-        for (int l = 0; l < V1_QK5_0; l++) {
-            const float v = x[i*V1_QK5_0 + l];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max = v;
-            }
-        }
-
-        const float d = max / -16;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        uint32_t qh = 0;
-
-        for (int l = 0; l < V1_QK5_0; l += 2) {
-            const float v0 = x[i*V1_QK5_0 + l + 0]*id;
-            const float v1 = x[i*V1_QK5_0 + l + 1]*id;
-
-            const uint32_t vi0 = MIN(31, (int) (v0 + 16.5f));
-            const uint32_t vi1 = MIN(31, (int) (v1 + 16.5f));
-
-            y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((vi0 & 0x10) >> 4) << (l + 0);
-            qh |= ((vi1 & 0x10) >> 4) << (l + 1);
-        }
-
-        memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
-    }
-}
-
-void quantize_row_v1_q5_0(const float * restrict x, void * restrict vy, int k) {
-    assert(k % V1_QK5_0 == 0);
-
-    block_v1_q5_0 * restrict y = vy;
-
-    quantize_row_v1_q5_0_reference(x, y, k);
-}
-
-size_t ggml_quantize_v1_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V1_QK5_0 == 0);
-    const int nb = k / V1_QK5_0;
-
-    for (int j = 0; j < n; j += k) {
-        block_v1_q5_0 * restrict y = (block_v1_q5_0 *)dst + j/V1_QK5_0;
-
-        quantize_row_v1_q5_0_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
-
-            for (int l = 0; l < V1_QK5_0; l += 2) {
-                const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
-                const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
-
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[l/2] >>   4) | vh1) / 2;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/V1_QK5_0*sizeof(block_v1_q5_0));
-}
-
-void dequantize_row_v1_q5_0(const void * restrict vx, float * restrict y, int k) {
-    assert(k % V1_QK5_0 == 0);
-    const int nb = k / V1_QK5_0;
-
-    const block_v1_q5_0 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int l = 0; l < V1_QK5_0; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            // extract the 5-th bit from qh
-            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
-            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
-
-            const int8_t vi0 = (vi & 0x0F) | vh0;
-            const int8_t vi1 = (vi >>   4) | vh1;
-
-            const float v0 = (vi0 - 16)*d;
-            const float v1 = (vi1 - 16)*d;
-
-            y[i*V1_QK5_0 + l + 0] = v0;
-            y[i*V1_QK5_0 + l + 1] = v1;
-
-            assert(!isnan(y[i*V1_QK5_0 + l + 0]));
-            assert(!isnan(y[i*V1_QK5_0 + l + 1]));
-        }
-    }
-}
-
-void ggml_vec_dot_v1_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / V1_QK8_0;
-
-    assert(n % V1_QK8_0 == 0);
-    assert(nb % 2 == 0);
-    assert(V1_QK8_0 == V1_QK5_0);
-
-    const block_v1_q5_0 * restrict x = vx;
-    const block_v1_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv = vdupq_n_f32(0.0f);
-
-    uint64_t tmp[4];
-
-    for (int i = 0; i < nb; ++i) {
-        const block_v1_q5_0 * restrict x0 = &x[i];
-        const block_v1_q8_0 * restrict y0 = &y[i];
-
-        const uint8x16_t m4b  = vdupq_n_u8(0x0F);
-        const int8x16_t  s16b = vdupq_n_s8(0x10);
-
-        // extract the 5th bit
-        uint32_t qh;
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_0[(qh >> 24)       ];
-
-        const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
-        const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
-
-        const uint8x16_t v0 = vld1q_u8(x0->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8  (v0, m4b));
-        const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4));
-
-        // interleave
-        const int8x16_t v0lz = vzip1q_s8(v0l, v0h);
-        const int8x16_t v0hz = vzip2q_s8(v0l, v0h);
-
-        // add high bit and sub 16
-        const int8x16_t v0lf = vsubq_s8(vorrq_s8(v0lz, qhl), s16b);
-        const int8x16_t v0hf = vsubq_s8(vorrq_s8(v0hz, qhh), s16b);
-
-        // load y
-        const int8x16_t v1l = vld1q_s8(y0->qs);
-        const int8x16_t v1h = vld1q_s8(y0->qs + 16);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0lf, v1l),
-                        vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-
-        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv);
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    uint64_t tmp[4];
-
-    for (int i = 0; i < nb; ++i) {
-        const block_v1_q5_0 * restrict x0 = &x[i];
-        const block_v1_q8_0 * restrict y0 = &y[i];
-
-        const v128_t m4b  = wasm_i8x16_splat(0x0F);
-        const v128_t s16b = wasm_i8x16_splat(0x10);
-
-        // extract the 5th bit
-        uint32_t qh;
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_0[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        // interleave
-        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h,  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23);
-        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h,  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-
-        // add high bit and sub 16
-        const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
-        const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
-                        wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
-    }
-
-    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
-        bx = _mm256_or_si256(bx, bxhi);
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps(d, q, acc);
-    }
-
-    *s = hsum_float_8(acc);
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const uint8_t * restrict x0 = x[i].qs;
-        const  int8_t * restrict y0 = y[i].qs;
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        int sxy = 0;
-
-        for (int j = 0; j < V1_QK8_0/2; j++) {
-            const uint8_t v0 = x0[j];
-
-            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
-            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
-
-            const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
-            const int x1_0 = ((v0 >>   4) | x1_0h) - 16;
-
-            const int y0_0 = y0[2*j + 0];
-            const int y1_0 = y0[2*j + 1];
-
-            sxy += x0_0*y0_0 + x1_0*y1_0;
-        }
-
-        sumf += (d*sxy)*y[i].d;
-    }
-    *s = sumf;
-#endif
-}
--- a/third_party/ggml/ggjt.v1.q5_0.h
+++ b/third_party/ggml/ggjt.v1.q5_0.h
@ -1,22 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_0_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_0_H_
-#include "third_party/ggml/fp16.h"
-COSMOPOLITAN_C_START_
-
-#define V1_QK5_0 32
-typedef struct {
-  ggml_fp16_t d;             // delta
-  uint8_t qh[4];             // 5-th bit of quants
-  uint8_t qs[V1_QK5_0 / 2];  // nibbles / quants
-} block_v1_q5_0;
-
-void dequantize_row_v1_q5_0(const void* restrict, float* restrict, int);
-void quantize_row_v1_q5_0(const float* restrict, void* restrict, int);
-size_t ggml_quantize_v1_q5_0(const float*, void*, int, int, int64_t*);
-void ggml_vec_dot_v1_q5_0_q8_0(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-void quantize_row_v1_q5_0_reference(const float* restrict,
-                                    block_v1_q5_0* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_0_H_ */
--- a/third_party/ggml/ggjt.v1.q5_1.c
+++ b/third_party/ggml/ggjt.v1.q5_1.c
@ -1,351 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v1.q5_1.h"
-#include "libc/assert.h"
-#include "libc/math.h"
-#include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/ggml/fp16.internal.h"
-#include "third_party/ggml/ggjt.v1.internal.h"
-#include "third_party/ggml/ggjt.v1.q8_1.h"
-
-static_assert(sizeof(block_v1_q5_1) ==
-                  2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + V1_QK5_1 / 2,
-              "wrong q5_1 block size/padding");
-
-void quantize_row_v1_q5_1_reference(const float * restrict x, block_v1_q5_1 * restrict y, int k) {
-    assert(k % V1_QK5_1 == 0);
-    const int nb = k / V1_QK5_1;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int l = 0; l < V1_QK5_1; l++) {
-            const float v = x[i*V1_QK5_1 + l];
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
-        uint32_t qh = 0;
-
-        for (int l = 0; l < V1_QK5_1; l += 2) {
-            const float v0 = (x[i*V1_QK5_1 + l + 0] - min)*id;
-            const float v1 = (x[i*V1_QK5_1 + l + 1] - min)*id;
-
-            const uint32_t vi0 = (int) (v0 + 0.5f);
-            const uint32_t vi1 = (int) (v1 + 0.5f);
-
-            y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((vi0 & 0x10) >> 4) << (l + 0);
-            qh |= ((vi1 & 0x10) >> 4) << (l + 1);
-        }
-
-        memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
-    }
-}
-
-void quantize_row_v1_q5_1(const float * restrict x, void * restrict vy, int k) {
-    assert(k % V1_QK5_1 == 0);
-
-    block_v1_q5_1 * restrict y = vy;
-
-    quantize_row_v1_q5_1_reference(x, y, k);
-}
-
-size_t ggml_quantize_v1_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V1_QK5_1 == 0);
-    const int nb = k / V1_QK5_1;
-
-    for (int j = 0; j < n; j += k) {
-        block_v1_q5_1 * restrict y = (block_v1_q5_1 *)dst + j/V1_QK5_1;
-
-        quantize_row_v1_q5_1_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
-
-            for (int l = 0; l < V1_QK5_1; l += 2) {
-                const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
-                const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
-
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[l/2] >>   4) | vh1) / 2;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/V1_QK5_1*sizeof(block_v1_q5_1));
-}
-
-void dequantize_row_v1_q5_1(const void * restrict vx, float * restrict y, int k) {
-    assert(k % V1_QK5_1 == 0);
-    const int nb = k / V1_QK5_1;
-
-    const block_v1_q5_1 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int l = 0; l < V1_QK5_1; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            // extract the 5-th bit from qh
-            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
-            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
-
-            const uint8_t vi0 = (vi & 0x0F) | vh0;
-            const uint8_t vi1 = (vi >>   4) | vh1;
-
-            const float v0 = vi0*d + m;
-            const float v1 = vi1*d + m;
-
-            y[i*V1_QK5_1 + l + 0] = v0;
-            y[i*V1_QK5_1 + l + 1] = v1;
-
-            assert(!isnan(y[i*V1_QK5_1 + l + 0]));
-            assert(!isnan(y[i*V1_QK5_1 + l + 1]));
-        }
-    }
-}
-
-void ggml_vec_dot_v1_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / V1_QK8_1;
-
-    assert(n % V1_QK8_1 == 0);
-    assert(nb % 2 == 0);
-    assert(V1_QK8_1 == V1_QK5_1);
-
-    const block_v1_q5_1 * restrict x = vx;
-    const block_v1_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv = vdupq_n_f32(0.0f);
-
-    float summs = 0.0f;
-
-    uint64_t tmp[4];
-
-    for (int i = 0; i < nb; ++i) {
-        const block_v1_q5_1 * restrict x0 = &x[i];
-        const block_v1_q8_1 * restrict y0 = &y[i];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
-
-        // extract the 5th bit
-        uint32_t qh;
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_0[(qh >> 24)       ];
-
-        const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
-        const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
-
-        const uint8x16_t v0 = vld1q_u8(x0->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8  (v0, vdupq_n_u8(0x0F)));
-        const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4));
-
-        // interleave
-        const int8x16_t v0lz = vzip1q_s8(v0l, v0h);
-        const int8x16_t v0hz = vzip2q_s8(v0l, v0h);
-
-        // add
-        const int8x16_t v0lf = vorrq_s8(v0lz, qhl);
-        const int8x16_t v0hf = vorrq_s8(v0hz, qhh);
-
-        // load y
-        const int8x16_t v1l = vld1q_s8(y0->qs);
-        const int8x16_t v1h = vld1q_s8(y0->qs + 16);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0lf, v1l),
-                        vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-
-        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv) + summs;
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    float summs = 0.0f;
-
-    uint64_t tmp[4];
-
-    for (int i = 0; i < nb; ++i) {
-        const block_v1_q5_1 * restrict x0 = &x[i];
-        const block_v1_q8_1 * restrict y0 = &y[i];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
-
-        const v128_t m4b = wasm_i8x16_splat(0x0F);
-
-        // extract the 5th bit
-        uint32_t qh;
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_0[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        static bool x = true;
-
-        // interleave
-        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h,  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23);
-        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h,  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-
-        // add high bit
-        const v128_t v0lf = wasm_v128_or(v0lz, qhl);
-        const v128_t v0hf = wasm_v128_or(v0hz, qhh);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
-                        wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
-    }
-
-    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
-
-        summs += GGML_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1);
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
-        bx = _mm256_or_si256(bx, bxhi);
-
-        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#else
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        const uint8_t * restrict x0 = x[i].qs;
-        const  int8_t * restrict y0 = y[i].qs;
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        int sxy = 0;
-
-        for (int j = 0; j < V1_QK8_1/2; j++) {
-            const uint8_t v0 = x0[j];
-
-            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
-            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
-
-            const int x0_0 = (v0 & 0x0F) | x0_0h;
-            const int x1_0 = (v0 >>   4) | x1_0h;
-
-            const int y0_0 = y0[2*j + 0];
-            const int y1_0 = y0[2*j + 1];
-
-            sxy += x0_0*y0_0 + x1_0*y1_0;
-        }
-
-        sumf += (d*sxy)*y[i].d + m*(y[i].s0 + y[i].s1);
-    }
-
-    *s = sumf;
-#endif
-}
--- a/third_party/ggml/ggjt.v1.q5_1.h
+++ b/third_party/ggml/ggjt.v1.q5_1.h
@ -1,23 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_1_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_1_H_
-#include "third_party/ggml/fp16.h"
-COSMOPOLITAN_C_START_
-
-#define V1_QK5_1 32
-typedef struct {
-  ggml_fp16_t d;             // delta
-  ggml_fp16_t m;             // min
-  uint8_t qh[4];             // 5-th bit of quants
-  uint8_t qs[V1_QK5_1 / 2];  // nibbles / quants
-} block_v1_q5_1;
-
-void dequantize_row_v1_q5_1(const void* restrict, float* restrict, int);
-void quantize_row_v1_q5_1(const float* restrict, void* restrict, int);
-size_t ggml_quantize_v1_q5_1(const float*, void*, int, int, int64_t*);
-void ggml_vec_dot_v1_q5_1_q8_1(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-void quantize_row_v1_q5_1_reference(const float* restrict,
-                                    block_v1_q5_1* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q5_1_H_ */
--- a/third_party/ggml/ggjt.v1.q8_0.c
+++ b/third_party/ggml/ggjt.v1.q8_0.c
@ -1,324 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v1.q8_0.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/ggml/ggjt.v1.internal.h"
-#include "third_party/ggml/ggjt.v1.q8_0.h"
-#include "third_party/intel/immintrin.internal.h"
-#include "third_party/libcxx/math.h"
-
-static_assert(sizeof(block_v1_q8_0) == sizeof(float) + V1_QK8_0,
-              "wrong q8_0 block size/padding");
-
-// reference implementation for deterministic creation of model files
-void quantize_row_v1_q8_0_reference(const float * restrict x, block_v1_q8_0 * restrict y, int k) {
-    assert(k % V1_QK8_0 == 0);
-    const int nb = k / V1_QK8_0;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int l = 0; l < V1_QK8_0; l++) {
-            const float v = x[i*V1_QK8_0 + l];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int l = 0; l < V1_QK8_0; ++l) {
-            const float v0 = x[i*V1_QK8_0 + l]*id;
-
-            y[i].qs[l] = roundf(v0);
-        }
-    }
-}
-
-void quantize_row_v1_q8_0(const float * restrict x, void * restrict vy, int k) {
-    assert(V1_QK8_0 == 32);
-    assert(k % V1_QK8_0 == 0);
-    const int nb = k / V1_QK8_0;
-
-    block_v1_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
-        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
-
-        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
-        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
-        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int l = 0; l < 8; l++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
-        }
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = d;
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#else
-    // scalar
-    quantize_row_v1_q8_0_reference(x, y, k);
-#endif
-}
-
-size_t ggml_quantize_v1_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V1_QK8_0 == 0);
-    const int nb = k / V1_QK8_0;
-
-    for (int j = 0; j < n; j += k) {
-        block_v1_q8_0 * restrict y = (block_v1_q8_0 *)dst + j/V1_QK8_0;
-
-        quantize_row_v1_q8_0_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < V1_QK8_0; ++l) {
-                const int8_t vi = y[i].qs[l];
-
-                hist[vi/16 + 8]++;
-            }
-        }
-    }
-
-    return (n/V1_QK8_0*sizeof(block_v1_q8_0));
-}
-
-void dequantize_row_v1_q8_0(const void * restrict vx, float * restrict y, int k) {
-    assert(k % V1_QK8_0 == 0);
-    const int nb = k / V1_QK8_0;
-
-    const block_v1_q8_0 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-
-        const int8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < V1_QK8_0; ++l) {
-            y[i*V1_QK8_0 + l] = pp[l]*d;
-        }
-    }
-}
-
-void ggml_vec_dot_v1_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / V1_QK8_0;
-
-    assert(n % V1_QK8_0 == 0);
-    assert(nb % 2 == 0);
-    assert(V1_QK8_0 == V1_QK8_0);
-
-    const block_v1_q8_0 * restrict x = vx;
-    const block_v1_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_v1_q8_0 * restrict x0 = &x[i + 0];
-        const block_v1_q8_0 * restrict x1 = &x[i + 1];
-        const block_v1_q8_0 * restrict y0 = &y[i + 0];
-        const block_v1_q8_0 * restrict y1 = &y[i + 1];
-
-        const int8x16_t x0_0 = vld1q_s8(x0->qs);
-        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
-        const int8x16_t x1_0 = vld1q_s8(x1->qs);
-        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
-
-        // load y
-        const int8x16_t y0_0 = vld1q_s8(y0->qs);
-        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
-        const int8x16_t y1_0 = vld1q_s8(y1->qs);
-        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
-
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
-
-#else
-        const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
-        const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
-        const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
-        const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
-
-        const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
-        const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
-        const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
-        const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
-
-        const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
-        const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
-        const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
-        const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        // Multiply q with scale and accumulate
-        acc = _mm256_fmadd_ps( d, q, acc );
-    }
-
-    *s = hsum_float_8(acc);
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        const int8_t * restrict x0 = x[i].qs;
-        const int8_t * restrict y0 = y[i].qs;
-
-        int sumi = 0;
-
-        for (int j = 0; j < V1_QK8_0; j++) {
-            const int v0 = x0[j];
-            const int v1 = y0[j];
-
-            sumi += v0*v1;
-        }
-
-        sumf += (x[i].d*y[i].d)*sumi;
-    }
-
-    *s = sumf;
-#endif
-}
--- a/third_party/ggml/ggjt.v1.q8_0.h
+++ b/third_party/ggml/ggjt.v1.q8_0.h
@ -1,20 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_0_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_0_H_
-COSMOPOLITAN_C_START_
-
-#define V1_QK8_0 32
-typedef struct {
-  float d;              // delta
-  int8_t qs[V1_QK8_0];  // quants
-} block_v1_q8_0;
-
-void dequantize_row_v1_q8_0(const void* restrict, float* restrict, int);
-void quantize_row_v1_q8_0(const float* restrict, void* restrict, int);
-size_t ggml_quantize_v1_q8_0(const float*, void*, int, int, int64_t*);
-void ggml_vec_dot_v1_q8_0_q8_0(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-void quantize_row_v1_q8_0_reference(const float* restrict,
-                                    block_v1_q8_0* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_0_H_ */
--- a/third_party/ggml/ggjt.v1.q8_1.c
+++ b/third_party/ggml/ggjt.v1.q8_1.c
@ -1,239 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v1.q8_1.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/ggml/ggml.h"
-#include "third_party/intel/immintrin.internal.h"
-#include "third_party/libcxx/math.h"
-
-static_assert(sizeof(block_v1_q8_1) == 3 * sizeof(float) + V1_QK8_1,
-              "wrong q8_1 block size/padding");
-
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
-// horizontally add 8 int32_t
-static inline int hsum_i32_8(const __m256i a) {
-    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
-    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
-    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-#endif /* AVX || AVX2 || AVX512F */
-
-// reference implementation for deterministic creation of model files
-void quantize_row_v1_q8_1_reference(const float * restrict x, block_v1_q8_1 * restrict y, int k) {
-    assert(V1_QK8_1 == 32);
-    assert(k % V1_QK8_1 == 0);
-    const int nb = k / V1_QK8_1;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int l = 0; l < V1_QK8_1; l++) {
-            const float v = x[i*V1_QK8_1 + l];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        int sum0 = 0;
-        int sum1 = 0;
-
-        for (int l = 0; l < V1_QK8_1/2; ++l) {
-            const float v0 = x[i*V1_QK8_1           + l]*id;
-            const float v1 = x[i*V1_QK8_1 + V1_QK8_1/2 + l]*id;
-
-            y[i].qs[          l] = roundf(v0);
-            y[i].qs[V1_QK8_1/2 + l] = roundf(v1);
-
-            sum0 += y[i].qs[          l];
-            sum1 += y[i].qs[V1_QK8_1/2 + l];
-        }
-
-        y[i].s0 = d * sum0;
-        y[i].s1 = d * sum1;
-    }
-}
-
-void quantize_row_v1_q8_1(const float * restrict x, void * restrict vy, int k) {
-    assert(k % V1_QK8_1 == 0);
-    const int nb = k / V1_QK8_1;
-
-    block_v1_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
-        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
-
-        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
-        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
-        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        int32x4_t accv0 = vdupq_n_s32(0);
-        int32x4_t accv1 = vdupq_n_s32(0);
-
-        // low half
-        for (int l = 0; l < 4; l++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
-
-            accv0 = vaddq_s32(accv0, vi);
-        }
-
-        // high half
-        for (int l = 4; l < 8; l++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
-
-            accv1 = vaddq_s32(accv1, vi);
-        }
-
-        const int32_t sum0 = vaddvq_s32(accv0);
-        const int32_t sum1 = vaddvq_s32(accv1);
-
-        y[i].s0 = d * sum0;
-        y[i].s1 = d * sum1;
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = d;
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Compute the sum of the quants and set y[i].s
-        //y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
-        y[i].s0 = d * hsum_i32_8(_mm256_add_epi32(i0, i1));
-        y[i].s1 = d * hsum_i32_8(_mm256_add_epi32(i2, i3));
-
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Compute the sum of the quants and set y[i].s
-        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
-        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s0 = d * hsum_i32_4(s0);
-        y[i].s1 = d * hsum_i32_4(s1);
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#else
-    // scalar
-    quantize_row_v1_q8_1_reference(x, y, k);
-#endif
-}
--- a/third_party/ggml/ggjt.v1.q8_1.h
+++ b/third_party/ggml/ggjt.v1.q8_1.h
@ -1,18 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_1_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_1_H_
-COSMOPOLITAN_C_START_
-
-#define V1_QK8_1 32
-typedef struct {
-  float d;              // delta
-  float s0;             // d * sum(qs[i]) low
-  float s1;             // d * sum(qs[i]) high
-  int8_t qs[V1_QK8_1];  // quants
-} block_v1_q8_1;
-
-void quantize_row_v1_q8_1(const float* restrict, void* restrict, int);
-void quantize_row_v1_q8_1_reference(const float* restrict,
-                                    block_v1_q8_1* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V1_Q8_1_H_ */
--- a/third_party/ggml/ggjt.v2.c
+++ b/third_party/ggml/ggjt.v2.c
@ -1,151 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2023 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "third_party/ggml/ggjt.v2.q4_0.h"
-#include "third_party/ggml/ggjt.v2.q4_1.h"
-#include "third_party/ggml/ggjt.v2.q5_0.h"
-#include "third_party/ggml/ggjt.v2.q5_1.h"
-#include "third_party/ggml/ggjt.v2.q8_0.h"
-#include "third_party/ggml/ggjt.v2.q8_1.h"
-#include "third_party/ggml/ggml.h"
-
-static const int ggjt_v2_blck_size[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = 1,
-    [GGML_TYPE_F16]  = 1,
-    [GGML_TYPE_Q4_0] = V2_QK4_0,
-    [GGML_TYPE_Q4_1] = V2_QK4_1,
-    [GGML_TYPE_Q5_0] = V2_QK5_0,
-    [GGML_TYPE_Q5_1] = V2_QK5_1,
-    [GGML_TYPE_Q8_0] = V2_QK8_0,
-    [GGML_TYPE_Q8_1] = V2_QK8_1,
-    [GGML_TYPE_I8]   = 1,
-    [GGML_TYPE_I16]  = 1,
-    [GGML_TYPE_I32]  = 1,
-};
-
-static const size_t ggjt_v2_type_size[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = sizeof(float),
-    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
-    [GGML_TYPE_Q4_0] = sizeof(block_v2_q4_0),
-    [GGML_TYPE_Q4_1] = sizeof(block_v2_q4_1),
-    [GGML_TYPE_Q5_0] = sizeof(block_v2_q5_0),
-    [GGML_TYPE_Q5_1] = sizeof(block_v2_q5_1),
-    [GGML_TYPE_Q8_0] = sizeof(block_v2_q8_0),
-    [GGML_TYPE_Q8_1] = sizeof(block_v2_q8_1),
-    [GGML_TYPE_I8]   = sizeof(int8_t),
-    [GGML_TYPE_I16]  = sizeof(int16_t),
-    [GGML_TYPE_I32]  = sizeof(int32_t),
-};
-
-static const char *const ggjt_v2_type_name[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = "f32",
-    [GGML_TYPE_F16]  = "f16",
-    [GGML_TYPE_Q4_0] = "q4_0",
-    [GGML_TYPE_Q4_1] = "q4_1",
-    [GGML_TYPE_Q4_2] = "q4_2",
-    [GGML_TYPE_Q5_0] = "q5_0",
-    [GGML_TYPE_Q5_1] = "q5_1",
-    [GGML_TYPE_Q8_0] = "q8_0",
-    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_I8]   = "i8",
-    [GGML_TYPE_I16]  = "i16",
-    [GGML_TYPE_I32]  = "i32",
-};
-
-static const bool ggjt_v2_is_quantized[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = false,
-    [GGML_TYPE_F16]  = false,
-    [GGML_TYPE_Q4_0] = true,
-    [GGML_TYPE_Q4_1] = true,
-    [GGML_TYPE_Q5_0] = true,
-    [GGML_TYPE_Q5_1] = true,
-    [GGML_TYPE_Q8_0] = true,
-    [GGML_TYPE_Q8_1] = true,
-    [GGML_TYPE_I8]   = false,
-    [GGML_TYPE_I16]  = false,
-    [GGML_TYPE_I32]  = false,
-};
-
-static const quantize_chunk_f *const ggjt_v2_quantize_chunk[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_Q4_0] = (void *)ggml_quantize_v2_q4_0,
-    [GGML_TYPE_Q4_1] = (void *)ggml_quantize_v2_q4_1,
-    [GGML_TYPE_Q5_0] = (void *)ggml_quantize_v2_q5_0,
-    [GGML_TYPE_Q5_1] = (void *)ggml_quantize_v2_q5_1,
-    [GGML_TYPE_Q8_0] = (void *)ggml_quantize_v2_q8_0,
-};
-
-static const quantize_fns_t ggjt_v2_quantize_fns[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_Q4_0] = {
-        .dequantize_row_q         = dequantize_row_v2_q4_0,
-        .quantize_row_q           = quantize_row_v2_q4_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q4_0_reference,
-        .quantize_row_q_dot       = quantize_row_v2_q8_0,
-        .vec_dot_q                = ggml_vec_dot_v2_q4_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q         = dequantize_row_v2_q4_1,
-        .quantize_row_q           = quantize_row_v2_q4_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q4_1_reference,
-        .quantize_row_q_dot       = quantize_row_v2_q8_1,
-        .vec_dot_q                = ggml_vec_dot_v2_q4_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q5_0] = {
-        .dequantize_row_q         = dequantize_row_v2_q5_0,
-        .quantize_row_q           = quantize_row_v2_q5_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q5_0_reference,
-        .quantize_row_q_dot       = quantize_row_v2_q8_0,
-        .vec_dot_q                = ggml_vec_dot_v2_q5_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q5_1] = {
-        .dequantize_row_q         = dequantize_row_v2_q5_1,
-        .quantize_row_q           = quantize_row_v2_q5_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q5_1_reference,
-        .quantize_row_q_dot       = quantize_row_v2_q8_1,
-        .vec_dot_q                = ggml_vec_dot_v2_q5_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q8_0] = {
-        .dequantize_row_q         = dequantize_row_v2_q8_0,
-        .quantize_row_q           = quantize_row_v2_q8_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q8_0_reference,
-        .quantize_row_q_dot       = quantize_row_v2_q8_0,
-        .vec_dot_q                = ggml_vec_dot_v2_q8_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q8_1] = {
-        .dequantize_row_q         = NULL,   // TODO
-        .quantize_row_q           = quantize_row_v2_q8_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_v2_q8_1_reference,
-        .quantize_row_q_dot       = quantize_row_v2_q8_1,
-        .vec_dot_q                = NULL,   // TODO
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-};
-
-void ggjt_v2(void) {
-    GGML_BLCK_SIZE = ggjt_v2_blck_size;
-    GGML_TYPE_SIZE = ggjt_v2_type_size;
-    GGML_TYPE_NAME = ggjt_v2_type_name;
-    GGML_IS_QUANTIZED = ggjt_v2_is_quantized;
-    quantize_fns = ggjt_v2_quantize_fns;
-    GGML_QUANTIZE_CHUNK = ggjt_v2_quantize_chunk;
-}
--- a/third_party/ggml/ggjt.v2.internal.h
+++ b/third_party/ggml/ggjt.v2.internal.h
@ -1,150 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_
-#include "libc/str/str.h"
-#include "third_party/aarch64/arm_neon.internal.h"
-#include "third_party/intel/immintrin.internal.h"
-COSMOPOLITAN_C_START_
-
-#if __AVX__ || __AVX2__ || __AVX512F__
-// horizontally add 8 floats
-static inline float hsum_float_8(const __m256 x) {
-  __m128 res = _mm256_extractf128_ps(x, 1);
-  res = _mm_add_ps(res, _mm256_castps256_ps128(x));
-  res = _mm_add_ps(res, _mm_movehl_ps(res, res));
-  res = _mm_add_ss(res, _mm_movehdup_ps(res));
-  return _mm_cvtss_f32(res);
-}
-#endif
-
-#if __AVX2__ || __AVX512F__
-
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t *x) {
-  uint32_t x32;
-  memcpy(&x32, x, sizeof(uint32_t));
-  const __m256i shuf_mask =
-      _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202,
-                        0x0101010101010101, 0x0000000000000000);
-  __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
-  const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
-  bytes = _mm256_or_si256(bytes, bit_mask);
-  return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
-}
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m256i x) {
-  const __m256i ones = _mm256_set1_epi16(1);
-  const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
-  return _mm256_cvtepi32_ps(summed_pairs);
-}
-
-static inline __m256 mul_sum_us8_pairs_float(const __m256i ax,
-                                             const __m256i sy) {
-#if __AVXVNNI__
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
-  return _mm256_cvtepi32_ps(summed_pairs);
-#else
-  // Perform multiplication and create 16-bit values
-  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-  return sum_i16_pairs_float(dot);
-#endif
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-#if __AVXVNNIINT8__
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
-  return _mm256_cvtepi32_ps(summed_pairs);
-#else
-  // Get absolute values of x vectors
-  const __m256i ax = _mm256_sign_epi8(x, x);
-  // Sign the values of the y vectors
-  const __m256i sy = _mm256_sign_epi8(y, x);
-  return mul_sum_us8_pairs_float(ax, sy);
-#endif
-}
-
-static inline __m256i bytes_from_nibbles_32(const uint8_t *rsi) {
-  const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
-  const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
-  const __m256i lowMask = _mm256_set1_epi8(0xF);
-  return _mm256_and_si256(lowMask, bytes);
-}
-
-#elif defined(__AVX__)
-
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t *x) {
-  uint32_t x32;
-  memcpy(&x32, x, sizeof(uint32_t));
-  const __m128i shuf_maskl =
-      _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
-  const __m128i shuf_maskh =
-      _mm_set_epi64x(0x0303030303030303, 0x0202020202020202);
-  __m128i bytesl = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskl);
-  __m128i bytesh = _mm_shuffle_epi8(_mm_set1_epi32(x32), shuf_maskh);
-  const __m128i bit_mask = _mm_set1_epi64x(0x7fbfdfeff7fbfdfe);
-  bytesl = _mm_or_si128(bytesl, bit_mask);
-  bytesh = _mm_or_si128(bytesh, bit_mask);
-  bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
-  bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
-  return _mm256_set_m128i(bytesh, bytesl);
-}
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
-  const __m128i ones = _mm_set1_epi16(1);
-  const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
-  const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
-  const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
-  return _mm256_cvtepi32_ps(summed_pairs);
-}
-
-static inline __m256 mul_sum_us8_pairs_float(const __m256i ax,
-                                             const __m256i sy) {
-  const __m128i axl = _mm256_castsi256_si128(ax);
-  const __m128i axh = _mm256_extractf128_si256(ax, 1);
-  const __m128i syl = _mm256_castsi256_si128(sy);
-  const __m128i syh = _mm256_extractf128_si256(sy, 1);
-  // Perform multiplication and create 16-bit values
-  const __m128i dotl = _mm_maddubs_epi16(axl, syl);
-  const __m128i doth = _mm_maddubs_epi16(axh, syh);
-  return sum_i16_pairs_float(doth, dotl);
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-  const __m128i xl = _mm256_castsi256_si128(x);
-  const __m128i xh = _mm256_extractf128_si256(x, 1);
-  const __m128i yl = _mm256_castsi256_si128(y);
-  const __m128i yh = _mm256_extractf128_si256(y, 1);
-  // Get absolute values of x vectors
-  const __m128i axl = _mm_sign_epi8(xl, xl);
-  const __m128i axh = _mm_sign_epi8(xh, xh);
-  // Sign the values of the y vectors
-  const __m128i syl = _mm_sign_epi8(yl, xl);
-  const __m128i syh = _mm_sign_epi8(yh, xh);
-  // Perform multiplication and create 16-bit values
-  const __m128i dotl = _mm_maddubs_epi16(axl, syl);
-  const __m128i doth = _mm_maddubs_epi16(axh, syh);
-  return sum_i16_pairs_float(doth, dotl);
-}
-
-// Unpack 32 4-bit fields into 32 bytes
-// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t *rsi) {
-  // Load 16 bytes from memory
-  __m128i tmpl = _mm_loadu_si128((const __m128i *)rsi);
-  __m128i tmph = _mm_srli_epi16(tmpl, 4);
-  const __m128i lowMask = _mm_set1_epi8(0xF);
-  tmpl = _mm_and_si128(lowMask, tmpl);
-  tmph = _mm_and_si128(lowMask, tmph);
-  return _mm256_set_m128i(tmph, tmpl);
-}
-
-#endif /* AVX */
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_INTERNAL_H_ */
--- a/third_party/ggml/ggjt.v2.q4_0.c
+++ b/third_party/ggml/ggjt.v2.q4_0.c
@ -1,395 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v2.q4_0.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "libc/math.h"
-#include "third_party/ggml/ggjt.v2.internal.h"
-#include "third_party/ggml/ggjt.v2.q8_0.h"
-
-static_assert(sizeof(block_v2_q4_0) == sizeof(float) + V2_QK4_0 / 2,
-              "wrong q4_0 block size/padding");
-
-void dequantize_row_v2_q4_0(const void * restrict x_, float * restrict y, int k) {
-    const block_v2_q4_0 * restrict x = x_;
-    static const int qk = V2_QK4_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int x0 = (x[i].qs[j] & 0x0F) - 8;
-            const int x1 = (x[i].qs[j] >>   4) - 8;
-
-            y[i*qk + j + 0   ] = x0*d;
-            y[i*qk + j + qk/2] = x1*d;
-        }
-    }
-}
-
-size_t ggml_quantize_v2_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V2_QK4_0 == 0);
-    const int nb = k / V2_QK4_0;
-
-    for (int b = 0; b < n; b += k) {
-        block_v2_q4_0 * restrict y = (block_v2_q4_0 *) dst + b/V2_QK4_0;
-
-        quantize_row_v2_q4_0_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < V2_QK4_0; j += 2) {
-                const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/V2_QK4_0*sizeof(block_v2_q4_0));
-}
-
-void quantize_row_v2_q4_0(const float * restrict x, void * restrict y, int k) {
-    quantize_row_v2_q4_0_reference(x, y, k);
-}
-
-// reference implementation for deterministic creation of model files
-void quantize_row_v2_q4_0_reference(const float * restrict x, block_v2_q4_0 * restrict y, int k) {
-    static const int qk = V2_QK4_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max  = 0.0f;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max  = v;
-            }
-        }
-
-        const float d  = max / -8;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + 0    + j]*id;
-            const float x1 = x[i*qk + qk/2 + j]*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
-
-            y[i].qs[j]  = xi0;
-            y[i].qs[j] |= xi1 << 4;
-        }
-    }
-}
-
-void ggml_vec_dot_v2_q4_0_q8_0(const int n,
-                               float * restrict s,
-                               const void * restrict vx,
-                               const void * restrict vy) {
-    const int qk = V2_QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-    assert(nb % 2 == 0);
-
-    const block_v2_q4_0 * restrict x = vx;
-    const block_v2_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_v2_q4_0 * restrict x0 = &x[i + 0];
-        const block_v2_q4_0 * restrict x1 = &x[i + 1];
-        const block_v2_q8_0 * restrict y0 = &y[i + 0];
-        const block_v2_q8_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b   = vdupq_n_u8(0x0F);
-        const int8x16_t  s8b   = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8( 8 );
-        bx = _mm256_sub_epi8( bx, off );
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps( d, q, acc );
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
-        const __m128i lowMask = _mm_set1_epi8(0xF);
-        const __m128i off = _mm_set1_epi8(8);
-
-        const __m128i tmp = _mm_loadu_si128((const __m128i *)x[i].qs);
-
-        __m128i bx = _mm_and_si128(lowMask, tmp);
-        __m128i by = _mm_loadu_si128((const __m128i *)y[i].qs);
-        bx = _mm_sub_epi8(bx, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx, by);
-
-        bx = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
-        by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
-        bx = _mm_sub_epi8(bx, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
-
-        // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
-
-        // Apply the scale, and accumulate
-        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__SSSE3__)
-    // set constants
-    const __m128i lowMask = _mm_set1_epi8(0xF);
-    const __m128i off = _mm_set1_epi8(8);
-
-    // Initialize accumulator with zeros
-    __m128 acc_0 = _mm_setzero_ps();
-    __m128 acc_1 = _mm_setzero_ps();
-    __m128 acc_2 = _mm_setzero_ps();
-    __m128 acc_3 = _mm_setzero_ps();
-
-    // First round without accumulation
-    {
-        _mm_prefetch(&x[0] + sizeof(block_v2_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[0] + sizeof(block_v2_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[0].d ), _mm_set1_ps( y[0].d ) );
-
-        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);
-
-        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
-        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[0].qs);
-        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-
-        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
-        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[0].qs + 16));
-        bx_1 = _mm_sub_epi8(bx_1, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
-
-        _mm_prefetch(&x[1] + sizeof(block_v2_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[1] + sizeof(block_v2_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[1].d ), _mm_set1_ps( y[1].d ) );
-
-        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);
-
-        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
-        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[1].qs);
-        bx_2 = _mm_sub_epi8(bx_2, off);
-        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
-
-        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
-        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[1].qs + 16));
-        bx_3 = _mm_sub_epi8(bx_3, off);
-        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
-
-        // Convert int32_t to float
-        __m128 p0 = _mm_cvtepi32_ps(i32_0);
-        __m128 p1 = _mm_cvtepi32_ps(i32_1);
-        __m128 p2 = _mm_cvtepi32_ps(i32_2);
-        __m128 p3 = _mm_cvtepi32_ps(i32_3);
-
-        // Apply the scale
-        acc_0 = _mm_mul_ps( d_0_1, p0 );
-        acc_1 = _mm_mul_ps( d_0_1, p1 );
-        acc_2 = _mm_mul_ps( d_2_3, p2 );
-        acc_3 = _mm_mul_ps( d_2_3, p3 );
-    }
-
-    // Main loop
-    for (int i = 2; i < nb; i+=2) {
-        _mm_prefetch(&x[i] + sizeof(block_v2_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[i] + sizeof(block_v2_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_mul_ps( _mm_set1_ps( x[i].d ), _mm_set1_ps( y[i].d ) );
-
-        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);
-
-        __m128i bx_0 = _mm_and_si128(lowMask, tmp_0_1);
-        __m128i by_0 = _mm_loadu_si128((const __m128i *)y[i].qs);
-        bx_0 = _mm_sub_epi8(bx_0, off);
-        const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-
-        __m128i bx_1 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_0_1, 4));
-        __m128i by_1 = _mm_loadu_si128((const __m128i *)(y[i].qs + 16));
-        bx_1 = _mm_sub_epi8(bx_1, off);
-        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);
-
-        _mm_prefetch(&x[i] + 2 * sizeof(block_v2_q4_0), _MM_HINT_T0);
-        _mm_prefetch(&y[i] + 2 * sizeof(block_v2_q8_0), _MM_HINT_T0);
-
-        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_mul_ps( _mm_set1_ps( x[i + 1].d ), _mm_set1_ps( y[i + 1].d ) );
-
-        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);
-
-        __m128i bx_2 = _mm_and_si128(lowMask, tmp_2_3);
-        __m128i by_2 = _mm_loadu_si128((const __m128i *)y[i + 1].qs);
-        bx_2 = _mm_sub_epi8(bx_2, off);
-        const __m128i i32_2 = mul_sum_i8_pairs(bx_2, by_2);
-
-        __m128i bx_3 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp_2_3, 4));
-        __m128i by_3 = _mm_loadu_si128((const __m128i *)(y[i + 1].qs + 16));
-        bx_3 = _mm_sub_epi8(bx_3, off);
-        const __m128i i32_3 = mul_sum_i8_pairs(bx_3, by_3);
-
-        // Convert int32_t to float
-        __m128 p0 = _mm_cvtepi32_ps(i32_0);
-        __m128 p1 = _mm_cvtepi32_ps(i32_1);
-        __m128 p2 = _mm_cvtepi32_ps(i32_2);
-        __m128 p3 = _mm_cvtepi32_ps(i32_3);
-
-        // Apply the scale
-        __m128 p0_d = _mm_mul_ps( d_0_1, p0 );
-        __m128 p1_d = _mm_mul_ps( d_0_1, p1 );
-        __m128 p2_d = _mm_mul_ps( d_2_3, p2 );
-        __m128 p3_d = _mm_mul_ps( d_2_3, p3 );
-
-        // Acummulate
-        acc_0 = _mm_add_ps(p0_d, acc_0);
-        acc_1 = _mm_add_ps(p1_d, acc_1);
-        acc_2 = _mm_add_ps(p2_d, acc_2);
-        acc_3 = _mm_add_ps(p3_d, acc_3);
-    }
-
-    *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[i].qs[j] & 0x0F) - 8;
-            const int v1 = (x[i].qs[j] >>   4) - 8;
-
-            sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += (x[i].d*y[i].d)*sumi;
-    }
-
-    *s = sumf;
-#endif
-}
--- a/third_party/ggml/ggjt.v2.q4_0.h
+++ b/third_party/ggml/ggjt.v2.q4_0.h
@ -1,20 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_0_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_0_H_
-COSMOPOLITAN_C_START_
-
-#define V2_QK4_0 32
-typedef struct {
-  float d;                   // delta
-  uint8_t qs[V2_QK4_0 / 2];  // nibbles / quants
-} block_v2_q4_0;
-
-void dequantize_row_v2_q4_0(const void* restrict, float* restrict, int);
-size_t ggml_quantize_v2_q4_0(const float*, void*, int, int, int64_t*);
-void quantize_row_v2_q4_0(const float* restrict, void* restrict, int);
-void quantize_row_v2_q4_0_reference(const float* restrict,
-                                    block_v2_q4_0* restrict, int);
-void ggml_vec_dot_v2_q4_0_q8_0(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_0_H_ */
--- a/third_party/ggml/ggjt.v2.q4_1.c
+++ b/third_party/ggml/ggjt.v2.q4_1.c
@ -1,252 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v2.q4_1.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "libc/math.h"
-#include "third_party/ggml/ggjt.v2.internal.h"
-#include "third_party/ggml/ggjt.v2.q8_1.h"
-
-static_assert(sizeof(block_v2_q4_1) == 2 * sizeof(float) + V2_QK4_1 / 2,
-              "wrong q4_1 block size/padding");
-
-void dequantize_row_v2_q4_1(const void * restrict x_, float * restrict y, int k) {
-    const block_v2_q4_1 * restrict x = x_;
-    static const int qk = V2_QK4_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-        const float m = x[i].m;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int x0 = (x[i].qs[j] & 0x0F);
-            const int x1 = (x[i].qs[j] >>   4);
-
-            y[i*qk + j + 0   ] = x0*d + m;
-            y[i*qk + j + qk/2] = x1*d + m;
-        }
-    }
-}
-
-size_t ggml_quantize_v2_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V2_QK4_1 == 0);
-    const int nb = k / V2_QK4_1;
-
-    for (int b = 0; b < n; b += k) {
-        block_v2_q4_1 * restrict y = (block_v2_q4_1 *) dst + b/V2_QK4_1;
-
-        quantize_row_v2_q4_1_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < V2_QK4_1; j += 2) {
-                const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[j/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/V2_QK4_1*sizeof(block_v2_q4_1));
-}
-
-void quantize_row_v2_q4_1(const float * restrict x, void * restrict y, int k) {
-    quantize_row_v2_q4_1_reference(x, y, k);
-}
-
-void ggml_vec_dot_v2_q4_1_q8_1(const int n,
-                               float * restrict s,
-                               const void * restrict vx,
-                               const void * restrict vy) {
-    const int qk = V2_QK8_1;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-    assert(nb % 2 == 0);
-
-    const block_v2_q4_1 * restrict x = vx;
-    const block_v2_q8_1 * restrict y = vy;
-
-    // TODO: add WASM SIMD
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs = 0;
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_v2_q4_1 * restrict x0 = &x[i + 0];
-        const block_v2_q4_1 * restrict x1 = &x[i + 1];
-        const block_v2_q8_1 * restrict y0 = &y[i + 0];
-        const block_v2_q8_1 * restrict y1 = &y[i + 1];
-
-        summs += x0->m * y0->s + x1->m * y1->s;
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
-#elif defined(__AVX2__) || defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        const float * d0 = &x[i].d;
-        const float * d1 = &y[i].d;
-
-        summs += x[i].m * y[i].s;
-
-        const __m256 d0v = _mm256_broadcast_ss( d0 );
-        const __m256 d1v = _mm256_broadcast_ss( d1 );
-
-        // Compute combined scales
-        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
-
-        const __m256 xy = mul_sum_us8_pairs_float(bx, by);
-
-        // Accumulate d0*d1*x*y
-#if defined(__AVX2__)
-        acc = _mm256_fmadd_ps( d0d1, xy, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d0d1, xy ), acc );
-#endif
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const int v0 = (x[i].qs[j] & 0x0F);
-            const int v1 = (x[i].qs[j] >>   4);
-
-            sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += (x[i].d*y[i].d)*sumi + x[i].m*y[i].s;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void quantize_row_v2_q4_1_reference(const float * restrict x, void * restrict y_, int k) {
-    block_v2_q4_1 * restrict y = y_;
-    const int qk = V2_QK4_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d  = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-        y[i].m = min;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
-
-            const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
-            const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
-
-            y[i].qs[j]  = xi0;
-            y[i].qs[j] |= xi1 << 4;
-        }
-    }
-}
--- a/third_party/ggml/ggjt.v2.q4_1.h
+++ b/third_party/ggml/ggjt.v2.q4_1.h
@ -1,20 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_1_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_1_H_
-COSMOPOLITAN_C_START_
-
-#define V2_QK4_1 32
-typedef struct {
-  float d;                   // delta
-  float m;                   // min
-  uint8_t qs[V2_QK4_1 / 2];  // nibbles / quants
-} block_v2_q4_1;
-
-void dequantize_row_v2_q4_1(const void* restrict, float* restrict, int);
-size_t ggml_quantize_v2_q4_1(const float*, void*, int, int, int64_t*);
-void quantize_row_v2_q4_1(const float* restrict, void* restrict, int);
-void ggml_vec_dot_v2_q4_1_q8_1(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-void quantize_row_v2_q4_1_reference(const float* restrict, void* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_V2_Q4_1_H_ */
--- a/third_party/ggml/ggjt.v2.q5_0.c
+++ b/third_party/ggml/ggjt.v2.q5_0.c
@ -1,392 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v2.q5_0.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "libc/math.h"
-#include "libc/str/str.h"
-#include "third_party/ggml/fp16.h"
-#include "third_party/ggml/fp16.internal.h"
-#include "third_party/ggml/ggjt.v2.internal.h"
-#include "third_party/ggml/ggjt.v2.q8_0.h"
-
-static_assert(sizeof(block_v2_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + V2_QK5_0 / 2,
-              "wrong q5_0 block size/padding");
-
-void dequantize_row_v2_q5_0(const void * restrict x_, float * restrict y, int k) {
-    const block_v2_q5_0 * restrict x = x_;
-    static const int qk = V2_QK5_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
-            const int32_t x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
-
-            y[i*qk + j + 0   ] = x0*d;
-            y[i*qk + j + qk/2] = x1*d;
-        }
-    }
-}
-
-void quantize_row_v2_q5_0(const float * restrict x, void * restrict y, int k) {
-    quantize_row_v2_q5_0_reference(x, y, k);
-}
-
-size_t ggml_quantize_v2_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V2_QK5_0 == 0);
-    const int nb = k / V2_QK5_0;
-
-    for (int b = 0; b < n; b += k) {
-        block_v2_q5_0 * restrict y = (block_v2_q5_0 *)dst + b/V2_QK5_0;
-
-        quantize_row_v2_q5_0_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
-
-            for (int j = 0; j < V2_QK5_0; j += 2) {
-                const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-                const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[j/2] >>   4) | vh1) / 2;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/V2_QK5_0*sizeof(block_v2_q5_0));
-}
-
-void ggml_vec_dot_v2_q5_0_q8_0(const int n,
-                               float * restrict s,
-                               const void * restrict vx,
-                               const void * restrict vy) {
-    const int qk = V2_QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-    assert(nb % 2 == 0);
-    assert(qk == V2_QK5_0);
-
-    const block_v2_q5_0 * restrict x = vx;
-    const block_v2_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    uint32_t qh0;
-    uint32_t qh1;
-
-    uint64_t tmp0[4];
-    uint64_t tmp1[4];
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_v2_q5_0 * restrict x0 = &x[i];
-        const block_v2_q5_0 * restrict x1 = &x[i + 1];
-        const block_v2_q8_0 * restrict y0 = &y[i];
-        const block_v2_q8_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        // extract the 5th bit via lookup table ((!b) << 4)
-        memcpy(&qh0, x0->qh, sizeof(qh0));
-        memcpy(&qh1, x1->qh, sizeof(qh1));
-
-        tmp0[0] = table_b2b_1[(qh0 >>  0) & 0xFF];
-        tmp0[1] = table_b2b_1[(qh0 >>  8) & 0xFF];
-        tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF];
-        tmp0[3] = table_b2b_1[(qh0 >> 24)       ];
-
-        tmp1[0] = table_b2b_1[(qh1 >>  0) & 0xFF];
-        tmp1[1] = table_b2b_1[(qh1 >>  8) & 0xFF];
-        tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF];
-        tmp1[3] = table_b2b_1[(qh1 >> 24)       ];
-
-        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
-        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
-        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
-        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
-        const int8x16_t v0_0lf = vsubq_s8(v0_0l, qhl0);
-        const int8x16_t v0_0hf = vsubq_s8(v0_0h, qhh0);
-        const int8x16_t v0_1lf = vsubq_s8(v0_1l, qhl1);
-        const int8x16_t v0_1hf = vsubq_s8(v0_1h, qhh1);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-        const float x1d = GGML_FP16_TO_FP32(x1->d);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    uint32_t qh;
-    uint64_t tmp[4];
-
-    // TODO: check if unrolling this is better
-    for (int i = 0; i < nb; ++i) {
-        const block_v2_q5_0 * restrict x0 = &x[i];
-        const block_v2_q8_0 * restrict y0 = &y[i];
-
-        const v128_t m4b  = wasm_i8x16_splat(0x0F);
-        const v128_t s16b = wasm_i8x16_splat(0x10);
-
-        // extract the 5th bit
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_1[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_1[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_1[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_1[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        // add high bit and sub 16 (equivalent to sub 0x10 when bit is zero)
-        const v128_t v0lf = wasm_i8x16_sub(v0l, qhl);
-        const v128_t v0hf = wasm_i8x16_sub(v0h, qhh);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
-                        wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
-    }
-
-    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
-        bx = _mm256_or_si256(bx, bxhi);
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps(d, q, acc);
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    __m128i mask = _mm_set1_epi8((char)0xF0);
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        __m128i bxhil = _mm256_castsi256_si128(bxhi);
-        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
-        bxhil = _mm_andnot_si128(bxhil, mask);
-        bxhih = _mm_andnot_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx);
-        __m128i bxh = _mm256_extractf128_si256(bx, 1);
-        bxl = _mm_or_si128(bxl, bxhil);
-        bxh = _mm_or_si128(bxh, bxhih);
-        bx = _mm256_set_m128i(bxh, bxl);
-
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_add_ps(_mm256_mul_ps(d, q), acc);
-    }
-
-    *s = hsum_float_8(acc);
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-            const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
-            const int32_t x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
-
-            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void quantize_row_v2_q5_0_reference(const float * restrict x, block_v2_q5_0 * restrict y, int k) {
-    static const int qk = V2_QK5_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max  = 0.0f;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max  = v;
-            }
-        }
-
-        const float d  = max / -16;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        uint32_t qh = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + 0    + j]*id;
-            const float x1 = x[i*qk + qk/2 + j]*id;
-
-            const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
-            const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
-
-            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((xi0 & 0x10) >> 4) << (j + 0);
-            qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
-        }
-
-        memcpy(&y[i].qh, &qh, sizeof(qh));
-    }
-}
--- a/third_party/ggml/ggjt.v2.q5_0.h
+++ b/third_party/ggml/ggjt.v2.q5_0.h
@ -1,22 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_0_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_0_H_
-#include "third_party/ggml/fp16.h"
-COSMOPOLITAN_C_START_
-
-#define V2_QK5_0 32
-typedef struct {
-  ggml_fp16_t d;             // delta
-  uint8_t qh[4];             // 5-th bit of quants
-  uint8_t qs[V2_QK5_0 / 2];  // nibbles / quants
-} block_v2_q5_0;
-
-void dequantize_row_v2_q5_0(const void* restrict, float* restrict, int);
-void quantize_row_v2_q5_0(const float* restrict, void* restrict, int);
-size_t ggml_quantize_v2_q5_0(const float*, void*, int, int, int64_t*);
-void ggml_vec_dot_v2_q5_0_q8_0(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-void quantize_row_v2_q5_0_reference(const float* restrict,
-                                    block_v2_q5_0* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_0_H_ */
--- a/third_party/ggml/ggjt.v2.q5_1.c
+++ b/third_party/ggml/ggjt.v2.q5_1.c
@ -1,408 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v2.q5_1.h"
-#include "libc/assert.h"
-#include "libc/math.h"
-#include "libc/str/str.h"
-#include "third_party/ggml/fp16.internal.h"
-#include "third_party/ggml/ggjt.v2.internal.h"
-#include "third_party/ggml/ggjt.v2.q8_1.h"
-
-static_assert(sizeof(block_v2_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + V2_QK5_1 / 2,
-              "wrong q5_1 block size/padding");
-
-void dequantize_row_v2_q5_1(const void * restrict x_, float * restrict y, int k) {
-    const block_v2_q5_1 * restrict x = x_;
-    static const int qk = V2_QK5_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int x0 = (x[i].qs[j] & 0x0F) | xh_0;
-            const int x1 = (x[i].qs[j] >>   4) | xh_1;
-
-            y[i*qk + j + 0   ] = x0*d + m;
-            y[i*qk + j + qk/2] = x1*d + m;
-        }
-    }
-}
-
-void quantize_row_v2_q5_1(const float * restrict x, void * restrict y, int k) {
-    quantize_row_v2_q5_1_reference(x, y, k);
-}
-
-size_t ggml_quantize_v2_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V2_QK5_1 == 0);
-    const int nb = k / V2_QK5_1;
-
-    for (int b = 0; b < n; b += k) {
-        block_v2_q5_1 * restrict y = (block_v2_q5_1 *)dst + b/V2_QK5_1;
-
-        quantize_row_v2_q5_1_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
-
-            for (int j = 0; j < V2_QK5_1; j += 2) {
-                const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-                const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
-
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[j/2] >>   4) | vh1) / 2;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/V2_QK5_1*sizeof(block_v2_q5_1));
-}
-
-void ggml_vec_dot_v2_q5_1_q8_1(const int n,
-                               float * restrict s,
-                               const void * restrict vx,
-                               const void * restrict vy) {
-    const int qk = V2_QK8_1;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-    assert(nb % 2 == 0);
-    assert(qk == V2_QK5_1);
-
-    const block_v2_q5_1 * restrict x = vx;
-    const block_v2_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs0 = 0.0f;
-    float summs1 = 0.0f;
-
-    uint32_t qh0;
-    uint32_t qh1;
-
-    uint64_t tmp0[4];
-    uint64_t tmp1[4];
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_v2_q5_1 * restrict x0 = &x[i];
-        const block_v2_q5_1 * restrict x1 = &x[i + 1];
-        const block_v2_q8_1 * restrict y0 = &y[i];
-        const block_v2_q8_1 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
-        summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
-
-        // extract the 5th bit via lookup table ((b) << 4)
-        memcpy(&qh0, x0->qh, sizeof(qh0));
-        memcpy(&qh1, x1->qh, sizeof(qh1));
-
-        tmp0[0] = table_b2b_0[(qh0 >>  0) & 0xFF];
-        tmp0[1] = table_b2b_0[(qh0 >>  8) & 0xFF];
-        tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF];
-        tmp0[3] = table_b2b_0[(qh0 >> 24)       ];
-
-        tmp1[0] = table_b2b_0[(qh1 >>  0) & 0xFF];
-        tmp1[1] = table_b2b_0[(qh1 >>  8) & 0xFF];
-        tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF];
-        tmp1[3] = table_b2b_0[(qh1 >> 24)       ];
-
-        const int8x16_t qhl0 = vld1q_s8((const int8_t *)(tmp0 + 0));
-        const int8x16_t qhh0 = vld1q_s8((const int8_t *)(tmp0 + 2));
-        const int8x16_t qhl1 = vld1q_s8((const int8_t *)(tmp1 + 0));
-        const int8x16_t qhh1 = vld1q_s8((const int8_t *)(tmp1 + 2));
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // add high bit
-        const int8x16_t v0_0lf = vorrq_s8(v0_0l, qhl0);
-        const int8x16_t v0_0hf = vorrq_s8(v0_0h, qhh0);
-        const int8x16_t v0_1lf = vorrq_s8(v0_1l, qhl1);
-        const int8x16_t v0_1hf = vorrq_s8(v0_1h, qhh1);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-        const float x1d = GGML_FP16_TO_FP32(x1->d);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), x0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), x1d*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    float summs = 0.0f;
-
-    uint32_t qh;
-    uint64_t tmp[4];
-
-    // TODO: check if unrolling this is better
-    for (int i = 0; i < nb; ++i) {
-        const block_v2_q5_1 * restrict x0 = &x[i];
-        const block_v2_q8_1 * restrict y0 = &y[i];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
-
-        const v128_t m4b = wasm_i8x16_splat(0x0F);
-
-        // extract the 5th bit
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_0[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        static bool x = true;
-
-        // add high bit
-        const v128_t v0lf = wasm_v128_or(v0l, qhl);
-        const v128_t v0hf = wasm_v128_or(v0h, qhh);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
-                        wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
-    }
-
-    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
-
-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
-        bx = _mm256_or_si256(bx, bxhi);
-
-        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
-
-        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    __m128i mask = _mm_set1_epi8(0x10);
-
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
-
-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        __m128i bxhil = _mm256_castsi256_si128(bxhi);
-        __m128i bxhih = _mm256_extractf128_si256(bxhi, 1);
-        bxhil = _mm_and_si128(bxhil, mask);
-        bxhih = _mm_and_si128(bxhih, mask);
-        __m128i bxl = _mm256_castsi256_si128(bx);
-        __m128i bxh = _mm256_extractf128_si256(bx, 1);
-        bxl = _mm_or_si128(bxl, bxhil);
-        bxh = _mm_or_si128(bxh, bxhih);
-        bx = _mm256_set_m128i(bxh, bxl);
-
-        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_us8_pairs_float(bx, by);
-
-        acc = _mm256_add_ps(_mm256_mul_ps(q, _mm256_mul_ps(dx, dy)), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        int sumi = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
-            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
-
-            const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0;
-            const int32_t x1 = (x[i].qs[j] >>  4) | xh_1;
-
-            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
-        }
-
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void quantize_row_v2_q5_1_reference(const float * restrict x, block_v2_q5_1 * restrict y, int k) {
-    const int qk = V2_QK5_1;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int j = 0; j < qk; j++) {
-            const float v = x[i*qk + j];
-
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d  = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
-        uint32_t qh = 0;
-
-        for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 0    + j] - min)*id;
-            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
-
-            const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
-            const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
-
-            y[i].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((xi0 & 0x10) >> 4) << (j + 0);
-            qh |= ((xi1 & 0x10) >> 4) << (j + qk/2);
-        }
-
-        memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
-    }
-}
--- a/third_party/ggml/ggjt.v2.q5_1.h
+++ b/third_party/ggml/ggjt.v2.q5_1.h
@ -1,23 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_1_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_1_H_
-#include "third_party/ggml/fp16.h"
-COSMOPOLITAN_C_START_
-
-#define V2_QK5_1 32
-typedef struct {
-  ggml_fp16_t d;             // delta
-  ggml_fp16_t m;             // min
-  uint8_t qh[4];             // 5-th bit of quants
-  uint8_t qs[V2_QK5_1 / 2];  // nibbles / quants
-} block_v2_q5_1;
-
-void dequantize_row_v2_q5_1(const void* restrict, float* restrict, int);
-void quantize_row_v2_q5_1(const float* restrict, void* restrict, int);
-size_t ggml_quantize_v2_q5_1(const float*, void*, int, int, int64_t*);
-void ggml_vec_dot_v2_q5_1_q8_1(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-void quantize_row_v2_q5_1_reference(const float* restrict,
-                                    block_v2_q5_1* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q5_1_H_ */
--- a/third_party/ggml/ggjt.v2.q8_0.c
+++ b/third_party/ggml/ggjt.v2.q8_0.c
@ -1,331 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v2.q8_0.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "libc/math.h"
-#include "third_party/ggml/ggjt.v2.internal.h"
-
-static_assert(sizeof(block_v2_q8_0) == sizeof(float) + V2_QK8_0,
-              "wrong q8_0 block size/padding");
-
-void dequantize_row_v2_q8_0(const void * restrict vx, float * restrict y, int k) {
-    static const int qk = V2_QK8_0;
-
-    assert(k % qk == 0);
-
-    const int nb = k / qk;
-
-    const block_v2_q8_0 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-
-        for (int j = 0; j < qk; ++j) {
-            y[i*qk + j] = x[i].qs[j]*d;
-        }
-    }
-}
-
-void quantize_row_v2_q8_0(const float * restrict x, void * restrict vy, int k) {
-    assert(V2_QK8_0 == 32);
-    assert(k % V2_QK8_0 == 0);
-    const int nb = k / V2_QK8_0;
-
-    block_v2_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int j = 0; j < 8; j++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
-        }
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = d;
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#else
-    // scalar
-    quantize_row_q8_0_reference(x, y, k);
-#endif
-}
-
-size_t ggml_quantize_v2_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % V2_QK8_0 == 0);
-    const int nb = k / V2_QK8_0;
-
-    for (int b = 0; b < n; b += k) {
-        block_v2_q8_0 * restrict y = (block_v2_q8_0 *)dst + b/V2_QK8_0;
-
-        quantize_row_v2_q8_0_reference(src + b, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int j = 0; j < V2_QK8_0; ++j) {
-                const int8_t vi = y[i].qs[j];
-
-                hist[vi/16 + 8]++;
-            }
-        }
-    }
-
-    return (n/V2_QK8_0*sizeof(block_v2_q8_0));
-}
-
-void ggml_vec_dot_v2_q8_0_q8_0(const int n,
-                               float * restrict s,
-                               const void * restrict vx,
-                               const void * restrict vy) {
-    const int qk = V2_QK8_0;
-    const int nb = n / qk;
-
-    assert(n % qk == 0);
-    assert(nb % 2 == 0);
-
-    const block_v2_q8_0 * restrict x = vx;
-    const block_v2_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_v2_q8_0 * restrict x0 = &x[i + 0];
-        const block_v2_q8_0 * restrict x1 = &x[i + 1];
-        const block_v2_q8_0 * restrict y0 = &y[i + 0];
-        const block_v2_q8_0 * restrict y1 = &y[i + 1];
-
-        const int8x16_t x0_0 = vld1q_s8(x0->qs);
-        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
-        const int8x16_t x1_0 = vld1q_s8(x1->qs);
-        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
-
-        // load y
-        const int8x16_t y0_0 = vld1q_s8(y0->qs);
-        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
-        const int8x16_t y1_0 = vld1q_s8(y1->qs);
-        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
-
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
-
-#else
-        const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
-        const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
-        const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
-        const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
-
-        const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
-        const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
-        const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
-        const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
-
-        const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
-        const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
-        const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
-        const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        // Multiply q with scale and accumulate
-#if defined(__AVX2__)
-        acc = _mm256_fmadd_ps( d, q, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
-    }
-
-    *s = hsum_float_8(acc);
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        int sumi = 0;
-
-        for (int j = 0; j < qk; j++) {
-            sumi += x[i].qs[j]*y[i].qs[j];
-        }
-
-        sumf += (x[i].d*y[i].d)*sumi;
-    }
-
-    *s = sumf;
-#endif
-}
-
-void quantize_row_v2_q8_0_reference(const float * restrict x, block_v2_q8_0 * restrict y, int k) {
-    assert(k % V2_QK8_0 == 0);
-    const int nb = k / V2_QK8_0;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int j = 0; j < V2_QK8_0; j++) {
-            const float v = x[i*V2_QK8_0 + j];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int j = 0; j < V2_QK8_0; ++j) {
-            const float x0 = x[i*V2_QK8_0 + j]*id;
-
-            y[i].qs[j] = roundf(x0);
-        }
-    }
-}
--- a/third_party/ggml/ggjt.v2.q8_0.h
+++ b/third_party/ggml/ggjt.v2.q8_0.h
@ -1,20 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_0_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_0_H_
-COSMOPOLITAN_C_START_
-
-#define V2_QK8_0 32
-typedef struct {
-  float d;              // delta
-  int8_t qs[V2_QK8_0];  // quants
-} block_v2_q8_0;
-
-void dequantize_row_v2_q8_0(const void* restrict, float* restrict, int);
-void quantize_row_v2_q8_0(const float* restrict, void* restrict, int);
-size_t ggml_quantize_v2_q8_0(const float*, void*, int, int, int64_t*);
-void ggml_vec_dot_v2_q8_0_q8_0(const int, float* restrict, const void* restrict,
-                               const void* restrict);
-void quantize_row_v2_q8_0_reference(const float* restrict,
-                                    block_v2_q8_0* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_0_H_ */
--- a/third_party/ggml/ggjt.v2.q8_1.c
+++ b/third_party/ggml/ggjt.v2.q8_1.c
@ -1,220 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8                               :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggjt.v2.q8_1.h"
-#include "libc/assert.h"
-#include "libc/macros.internal.h"
-#include "libc/math.h"
-#include "third_party/ggml/ggjt.v2.internal.h"
-
-static_assert(sizeof(block_v2_q8_1) == 2*sizeof(float) + V2_QK8_1,
-              "wrong q8_1 block size/padding");
-
-#if __AVX__ || __AVX2__ || __AVX512F__
-static inline int hsum_i32_8(const __m256i a) {
-    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
-    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
-    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-#endif
-
-void quantize_row_v2_q8_1(const float * restrict x, void * restrict vy, int k) {
-    assert(k % V2_QK8_1 == 0);
-    const int nb = k / V2_QK8_1;
-
-    block_v2_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int j = 0; j < 8; j++) srcv[j]  = vld1q_f32(x + i*32 + 4*j);
-        for (int j = 0; j < 8; j++) asrcv[j] = vabsq_f32(srcv[j]);
-
-        for (int j = 0; j < 4; j++) amaxv[2*j] = vmaxq_f32(asrcv[2*j], asrcv[2*j+1]);
-        for (int j = 0; j < 2; j++) amaxv[4*j] = vmaxq_f32(amaxv[4*j], amaxv[4*j+2]);
-        for (int j = 0; j < 1; j++) amaxv[8*j] = vmaxq_f32(amaxv[8*j], amaxv[8*j+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        int32x4_t accv = vdupq_n_s32(0);
-
-        for (int j = 0; j < 8; j++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[j], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
-
-            accv = vaddq_s32(accv, vi);
-        }
-
-        y[i].s = d * vaddvq_s32(accv);
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = d;
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Compute the sum of the quants and set y[i].s
-        y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
-
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Compute the sum of the quants and set y[i].s
-        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
-        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1));
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#else
-    // scalar
-    quantize_row_q8_1_reference(x, y, k);
-#endif
-}
-
-void quantize_row_v2_q8_1_reference(const float * restrict x, block_v2_q8_1 * restrict y, int k) {
-    assert(V2_QK8_1 == 32);
-    assert(k % V2_QK8_1 == 0);
-    const int nb = k / V2_QK8_1;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int j = 0; j < V2_QK8_1; j++) {
-            const float v = x[i*V2_QK8_1 + j];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        int sum = 0;
-
-        for (int j = 0; j < V2_QK8_1/2; ++j) {
-            const float v0 = x[i*V2_QK8_1           + j]*id;
-            const float v1 = x[i*V2_QK8_1 + V2_QK8_1/2 + j]*id;
-
-            y[i].qs[          j] = roundf(v0);
-            y[i].qs[V2_QK8_1/2 + j] = roundf(v1);
-
-            sum += y[i].qs[          j];
-            sum += y[i].qs[V2_QK8_1/2 + j];
-        }
-
-        y[i].s = d * sum;
-    }
-}
--- a/third_party/ggml/ggjt.v2.q8_1.h
+++ b/third_party/ggml/ggjt.v2.q8_1.h
@ -1,17 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_1_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_1_H_
-COSMOPOLITAN_C_START_
-
-#define V2_QK8_1 32
-typedef struct {
-  float d;              // delta
-  float s;              // d * sum(qs[i])
-  int8_t qs[V2_QK8_1];  // quants
-} block_v2_q8_1;
-
-void quantize_row_v2_q8_1(const float* restrict, void* restrict, int);
-void quantize_row_v2_q8_1_reference(const float* restrict,
-                                    block_v2_q8_1* restrict, int);
-
-COSMOPOLITAN_C_END_
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_GGJT_V2_Q8_1_H_ */
--- a/third_party/ggml/ggml.c
+++ b/third_party/ggml/ggml.c
--- a/third_party/ggml/ggml.h
+++ b/third_party/ggml/ggml.h
--- a/third_party/ggml/llama.cc
+++ b/third_party/ggml/llama.cc
--- a/third_party/ggml/llama.h
+++ b/third_party/ggml/llama.h
@ -1,276 +0,0 @@
-// -*- c++; c-basic-offset:4 -*-
-#ifndef LLAMA_H
-#define LLAMA_H
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/vector"
-
-#ifdef LLAMA_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef LLAMA_BUILD
-#            define LLAMA_API __declspec(dllexport)
-#        else
-#            define LLAMA_API __declspec(dllimport)
-#        endif
-#    else
-#        define LLAMA_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define LLAMA_API
-#endif
-
-#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
-#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
-#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
-#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
-#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
-
-#define LLAMA_FILE_VERSION           3
-#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
-#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
-#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION        1
-
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
-// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
-#define LLAMA_SUPPORTS_GPU_OFFLOAD
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    //
-    // C interface
-    //
-    // TODO: show sample usage
-    //
-
-    struct llama_context;
-
-    typedef int llama_token;
-
-    typedef struct llama_token_data {
-        llama_token id;  // token id
-        float logit; // log-odds of the token
-        float p;     // probability of the token
-    } llama_token_data;
-
-    typedef struct llama_token_data_array {
-        llama_token_data * data;
-        size_t size;
-        bool sorted;
-    } llama_token_data_array;
-
-    typedef void (*llama_progress_callback)(float progress, void *ctx);
-
-    struct llama_context_params {
-        int n_ctx;        // text context
-        int n_parts;      // -1 for default
-        int n_gpu_layers; // number of layers to store in VRAM
-        int seed;         // RNG seed, -1 for random
-
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool vocab_only; // only load the vocabulary, no weights
-        bool use_mmap;   // use mmap if possible
-        bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
-    };
-
-    // model file types
-    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32              = 0,
-        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        LLAMA_FTYPE_MOSTLY_Q4_2          = 5, // except 1d tensors
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
-    };
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
-
-    // TODO: not great API - very likely to change
-    // Initialize the llama + ggml backend
-    // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
-
-    LLAMA_API int64_t llama_time_us();
-
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params,
-                                    int   verbose);
-
-    // Frees all allocated memory
-    LLAMA_API void llama_free(struct llama_context * ctx);
-
-    // TODO: not great API - very likely to change
-    // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
-    LLAMA_API int llama_model_quantize(
-            const char * fname_inp,
-            const char * fname_out,
-      enum llama_ftype   ftype,
-            int          nthread);
-
-    // Apply a LoRA adapter to a loaded model
-    // path_base_model is the path to a higher quality model to use as a base for
-    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-    // will be applied on top of the previous one
-    // Returns 0 on success
-    LLAMA_API int llama_apply_lora_from_file(
-            struct llama_context * ctx,
-                      const char * path_lora,
-                      const char * path_base_model,
-                             int   n_threads);
-
-    // Returns the number of tokens in the KV cache
-    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
-
-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
-
-    // Returns the maximum size in bytes of the state (rng, logits, embedding
-    // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
-
-    // Copies the state to the specified destination address.
-    // Destination needs to have allocated enough memory.
-    // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
-
-    // Set the state reading from the specified address
-    // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
-
-    // Run the llama inference to obtain the logits and probabilities for the next token.
-    // tokens + n_tokens is the provided batch of new tokens to process
-    // n_past is the number of tokens to use from previous eval calls
-    // Returns 0 on success
-    LLAMA_API int llama_eval(
-            struct llama_context * ctx,
-               const llama_token * tokens,
-                             int   n_tokens,
-                             int   n_past,
-                             int   n_threads);
-
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
-    LLAMA_API int llama_tokenize(
-            struct llama_context * ctx,
-                      const char * text,
-                     llama_token * tokens,
-                             int   n_max_tokens,
-                            bool   add_bos);
-
-    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
-    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
-
-    // Returns number of bytes in the longest token string.
-    LLAMA_API int llama_longest_token(const struct llama_context * ctx);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
-
-    // Sampling functions
-
-    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
-
-    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
-
-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
-
-    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
-
-    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
-
-    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
-
-    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
-    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
-
-    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
-
-    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
-
-    /// @details Selects the token with the highest probability.
-    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
-
-    /// @details Randomly selects a token from the candidates based on their probabilities.
-    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
-
-    // Performance information
-    LLAMA_API void llama_print_timings(struct llama_context * ctx);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
-
-    // Print system information
-    LLAMA_API const char * llama_print_system_info(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
-#ifdef LLAMA_API_INTERNAL
-
-struct ggml_tensor;
-
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
-
-#endif
-
-#endif // LLAMA_H
--- a/third_party/ggml/llama_util.h
+++ b/third_party/ggml/llama_util.h
@ -1,396 +0,0 @@
-// -*- c++; c-basic-offset:4 -*-
-#ifndef LLAMA_UTIL_H
-#define LLAMA_UTIL_H
-#include "libc/calls/struct/rlimit.h"
-#include "libc/dce.h"
-#include "libc/runtime/sysconf.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/madv.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/sysv/consts/rlimit.h"
-#include "third_party/libcxx/cerrno"
-#include "third_party/libcxx/climits"
-#include "third_party/libcxx/cstdarg"
-#include "third_party/libcxx/cstdint"
-#include "third_party/libcxx/cstdio"
-#include "third_party/libcxx/cstdlib"
-#include "third_party/libcxx/cstring"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/vector"
-
-// Internal header to be included only by llama.cpp.
-// Contains wrappers around OS interfaces.
-
-#define LLAMA_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((__format__(__gnu_printf__, 1, 2)))
-#else
-__attribute__((__format__(__printf__, 1, 2)))
-#endif
-__attribute__((__noreturn__))
-#endif
-static void Die(const char *fmt, ...) {
-    va_list va;
-    va_start(va, fmt);
-    vfprintf(stderr, fmt, va);
-    va_end(va);
-    fputc('\n', stderr);
-    exit(1);
-}
-
-static inline bool is_integer_str(const char *s) {
-    if (*s == '-') ++s;
-    if (!*s) return false;
-    while (isdigit(*s)) ++s;
-    return !*s;
-}
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            Die("failed to open %s: %s", fname, std::strerror(errno));
-        }
-        seek(0, SEEK_END);
-        size = tell();
-        seek(0, SEEK_SET);
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        LLAMA_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            Die("read error: %s", strerror(errno));
-        }
-        if (ret != 1) {
-            Die("unexpectedly reached end of file");
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            Die("write error: %s", strerror(errno));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-#if defined(_WIN32)
-static std::string llama_format_win_err(DWORD err) {
-    LPSTR buf;
-    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
-    if (!size) {
-        return "FormatMessageA failed";
-    }
-    std::string ret(buf, size);
-    LocalFree(buf);
-    return ret;
-}
-#endif
-
-struct llama_mmap {
-    void * addr;
-    size_t size;
-
-    llama_mmap(const llama_mmap &) = delete;
-
-#ifdef _POSIX_MAPPED_FILES
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
-        size = file->size;
-        int fd = fileno(file->fp);
-        int flags = MAP_SHARED;
-#if defined(__linux__) || defined(__COSMOPOLITAN__)
-        flags |= MAP_POPULATE;
-#endif
-        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        if (addr == MAP_FAILED) {
-            Die("mmap failed: %s", strerror(errno));
-        }
-
-        if (prefetch && !IsWindows()) {
-            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-    }
-
-    ~llama_mmap() {
-        munmap(addr, size);
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
-        size = file->size;
-
-        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
-
-        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();
-
-        if (hMapping == NULL) {
-            Die("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
-        }
-
-        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
-        CloseHandle(hMapping);
-
-        if (addr == NULL) {
-            Die("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
-        }
-
-        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        if (prefetch) {
-            // Advise the kernel to preload the mapped memory
-            WIN32_MEMORY_RANGE_ENTRY range;
-            range.VirtualAddress = addr;
-            range.NumberOfBytes = (SIZE_T)size;
-            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-            }
-        }
-        #else
-        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
-        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
-    }
-
-    ~llama_mmap() {
-        if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    llama_mmap(struct llama_file *) {
-        Die("mmap not supported");
-    }
-#endif
-};
-
-// Represents some region of memory being locked using mlock or VirtualLock;
-// will automatically unlock on destruction.
-struct llama_mlock {
-    void * addr = NULL;
-    size_t size = 0;
-    bool failed_already = false;
-
-    llama_mlock() {}
-    llama_mlock(const llama_mlock &) = delete;
-
-    ~llama_mlock() {
-        if (size) {
-            raw_unlock(addr, size);
-        }
-    }
-
-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
-    }
-
-    void grow_to(size_t target_size) {
-        LLAMA_ASSERT(addr);
-        if (failed_already) {
-            return;
-        }
-        size_t granularity = lock_granularity();
-        target_size = (target_size + granularity - 1) & ~(granularity - 1);
-        if (target_size > size) {
-            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
-                size = target_size;
-            } else {
-                failed_already = true;
-            }
-        }
-    }
-
-#ifdef _POSIX_MEMLOCK_RANGE
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        return (size_t) sysconf(_SC_PAGESIZE);
-    }
-
-    #ifdef __APPLE__
-        #define MLOCK_SUGGESTION \
-            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-    #else
-        #define MLOCK_SUGGESTION \
-            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-    #endif
-
-    bool raw_lock(const void * addr, size_t size) {
-        if (!mlock(addr, size)) {
-            return true;
-        } else {
-            char* errmsg = std::strerror(errno);
-            bool suggest = (errno == ENOMEM);
-
-            // Check if the resource limit is fine after all
-            struct rlimit lock_limit;
-            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
-                suggest = false;
-            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
-                suggest = false;
-
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
-            return false;
-        }
-    }
-
-    #undef MLOCK_SUGGESTION
-
-    void raw_unlock(void * addr, size_t size) {
-        if (munlock(addr, size)) {
-            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
-        }
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        SYSTEM_INFO si;
-        GetSystemInfo(&si);
-        return (size_t) si.dwPageSize;
-    }
-
-    bool raw_lock(void * addr, size_t size) {
-        for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
-                return true;
-            }
-            if (tries == 2) {
-                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-
-            // It failed but this was only the first try; increase the working
-            // set size and try again.
-            SIZE_T min_ws_size, max_ws_size;
-            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-            // Per MSDN: "The maximum number of pages that a process can lock
-            // is equal to the number of pages in its minimum working set minus
-            // a small overhead."
-            // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
-            // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += increment;
-            max_ws_size += increment;
-            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-        }
-    }
-
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
-            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    void raw_lock(const void * addr, size_t size) {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
-    }
-
-    void raw_unlock(const void * addr, size_t size) {}
-#endif
-};
-
-// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
-struct llama_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        free(addr);
-        addr = (uint8_t *)memalign(32, size); // [jart] always avx align
-        this->size = size;
-    }
-
-    ~llama_buffer() {
-        free(addr);
-    }
-};
-#endif
--- a/third_party/ggml/main.cc
+++ b/third_party/ggml/main.cc
@ -1,972 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                             :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  llama.com                                                                   │
-│  Copyright (c) 2023 Justine Alexandra Roberts Tunney                         │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/sched_param.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/stat.h"
-#include "libc/log/log.h"
-#include "libc/macros.internal.h"
-#include "libc/nexgen32e/x86feature.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
-#include "libc/sysv/consts/ioprio.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/msync.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/prio.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/sysv/consts/sig.h"
-#include "third_party/ggml/common.h"
-#include "third_party/ggml/llama.h"
-#include "third_party/ggml/llama_util.h"
-#include "third_party/libcxx/atomic"
-#include "third_party/libcxx/iostream"
-#include "third_party/libcxx/string"
-#include "libc/serialize.h"
-#include "third_party/libcxx/vector"
-
-#define EPHEMERAL(fmt) "\r\e[K\033[1;35m" fmt " \033[0m"
-
-asm(".ident\t\"\\n\\n\
-llama.cpp (MIT License)\\n\
-Copyright (c) 2023 Georgi Gerganov\"");
-asm(".include \"libc/disclaimer.inc\"");
-
-static gpt_params params;
-static llama_context * ctx;
-static console_state con_st;
-
-static int n_past;
-static int n_remain;
-static int n_consumed;
-static bool input_noecho;
-
-////////////////////////////////////////////////////////////////////////////////
-
-static std::atomic<bool> is_stalled;
-static std::atomic<bool> is_terminated;
-static std::atomic<bool> is_interacting;
-
-static void acknowledge_shutdown(void) {
-    write(2, "^C", 2);
-}
-
-static void sigint_handler_batch(int signo) {
-    is_terminated = true;
-    acknowledge_shutdown();
-}
-
-static void sigint_handler_interactive(int signo) {
-    if (!is_interacting && !is_stalled) {
-        is_interacting = true;
-    } else {
-        is_terminated = true;
-        acknowledge_shutdown();
-    }
-}
-
-static int CompareTime(struct timespec a, struct timespec b) {
-  int cmp;
-  if (!(cmp = (a.tv_sec > b.tv_sec) - (a.tv_sec < b.tv_sec))) {
-    cmp = (a.tv_nsec > b.tv_nsec) - (a.tv_nsec < b.tv_nsec);
-  }
-  return cmp;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// ux explanatory logging for llama.com developers
-
-#if 0
-#define DEVLOG(...) (void)0
-#else
-#define DEVLOG(...) if (g_devlog) fprintf(g_devlog, __VA_ARGS__)
-static FILE *g_devlog;
-__attribute__((__constructor__)) static void init(void) {
-    char path[PATH_MAX];
-    static char linebuf[4096];
-    snprintf(path, sizeof(path), "/tmp/llama-%s.log", getenv("USER"));
-    if ((g_devlog = fopen(path, "wa"))) {
-        setvbuf(g_devlog, linebuf, _IOLBF, sizeof(linebuf));
-    }
-}
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-
-enum jtlp_status {
-    kPromptPending,
-    kPromptCompleted,
-    kPromptFinished
-};
-
-struct jtlp_header {
-    uint8_t magic[4];
-    uint8_t version[4];
-    uint8_t state_size[8];
-    uint8_t model_dev[8];
-    uint8_t model_ino[8];
-    uint8_t model_mtim_sec[8];
-    uint8_t model_mtim_nsec[8];
-    uint8_t prompt_size[8];
-};
-
-constexpr uint32_t kJtlpMagic = 'j' | 't' << 8 | 'l' << 16 | 'p' << 24;
-constexpr uint32_t kJtlpVersion = 0;
-
-static std::string last_output;
-static std::vector<llama_token> last_n_tokens;
-static std::string::size_type longest_antiprompt;
-static enum jtlp_status prompt_status = kPromptPending;
-
-static void remember_init() {
-    last_output.clear();
-    last_n_tokens.resize(llama_n_ctx(ctx), 0);
-    for (std::string & antiprompt : params.antiprompt) {
-        longest_antiprompt = std::max(longest_antiprompt, antiprompt.size());
-    }
-    longest_antiprompt += llama_longest_token(ctx) * 2;
-}
-
-static void remember_token(llama_token tok,
-                           bool is_user_input = false) {
-    last_n_tokens.erase(last_n_tokens.begin());
-    last_n_tokens.push_back(tok);
-    if (!is_user_input) {
-        last_output.append(llama_token_to_str(ctx, tok));
-        if (last_output.size() > longest_antiprompt) {
-            last_output.erase(0, last_output.size() - longest_antiprompt);
-        }
-    }
-    DEVLOG("remember_token(%`'s, %d) -> %`'s\n",
-           llama_token_to_str(ctx, tok), is_user_input,
-           last_output.c_str());
-}
-
-static bool has_antiprompt(std::string::size_type *out_index = nullptr,
-                           std::string *out_antiprompt = nullptr) {
-    for (std::string & antiprompt : params.antiprompt) {
-        std::string::size_type index = last_output.rfind(antiprompt);
-        if (index != std::string::npos) {
-            if (out_index) *out_index = index;
-            if (out_antiprompt) *out_antiprompt = antiprompt;
-            DEVLOG("found antiprompt %`'s at index %d of %`'s\n",
-                   antiprompt.c_str(), (int)index, last_output.c_str());
-            return true;
-        }
-    }
-    return false;
-}
-
-static void finish_initializing_prompt() {
-    prompt_status = kPromptFinished;
-    if (params.interactive) {
-        std::string::size_type ap_index;
-        is_interacting = true;
-        if (has_antiprompt(&ap_index)) {
-            console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-            printf("%s", last_output.substr(ap_index).c_str());
-            fflush(stdout);
-        }
-        console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-    }
-    last_output.clear();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-static int on_missing_feature(const char *name) {
-    fprintf(stderr, "%s: error: cpuid %s not detected\n", __func__, name);
-    fprintf(stderr, "%s: amd microprocessors made after 2017 usually work\n", __func__);
-    fprintf(stderr, "%s: intel microprocessors made after 2013 usually work\n", __func__);
-    return 1;
-}
-
-int main(int argc, char ** argv) {
-
-    verynice();
-    ShowCrashReports();
-
-    setvbuf(stdin, NULL, _IONBF, 0);
-    setvbuf(stdout, NULL, _IONBF, 0);
-    setvbuf(stderr, NULL, _IONBF, 0);
-
-    params.model = "models/llama-7B/ggml-model.bin";
-
-#ifdef __x86_64__
-    if (!X86_HAVE(AVX2)) return on_missing_feature("avx2");
-    if (!X86_HAVE(AVX)) return on_missing_feature("avx");
-    if (!X86_HAVE(FMA)) return on_missing_feature("fma");
-    if (!X86_HAVE(SSE3)) return on_missing_feature("sse3");
-    if (!X86_HAVE(F16C)) return on_missing_feature("f16c");
-#endif /* __x86_64__ */
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    // save choice to use color for later
-    // (note for later: this is a slightly awkward choice)
-    con_st.use_color = params.use_color;
-
-    con_st.multiline_input = params.multiline_input;
-    console_init(con_st);
-    atexit([]() { console_cleanup(con_st); });
-
-    if (params.perplexity) {
-        printf("\n************\n");
-        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-
-    if (params.embedding) {
-        printf("\n************\n");
-        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-
-    if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    if (params.verbose > 0) {
-        fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
-    }
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-//    params.prompt = R"(// this function checks if the number n is prime
-//bool is_prime(int n) {)";
-
-    struct stat model_stat;
-
-    // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return 1;
-    }
-
-    stat(params.model.c_str(), &model_stat);
-
-    if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(ctx,
-                                             params.lora_adapter.c_str(),
-                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return 1;
-        }
-    }
-
-    // print system information
-    if (params.verbose > 0) {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
-    }
-
-    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
-    // uncomment the "used_mem" line in llama.cpp to see the results
-    if (params.mem_test) {
-        {
-            const std::vector<llama_token> tmp(params.n_batch, 0);
-            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
-        }
-
-        {
-            const std::vector<llama_token> tmp = { 0, };
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
-        }
-
-        if (params.verbose > 0) {
-            llama_print_timings(ctx);
-        }
-        llama_free(ctx);
-
-        return 0;
-    }
-
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    // params.prompt.insert(0, 1, ' ');
-
-    // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-
-    const int n_ctx = llama_n_ctx(ctx);
-
-    if ((int) embd_inp.size() > n_ctx - 4) {
-        fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
-        return 1;
-    }
-
-    // number of tokens to keep when resetting context
-    int n_keep = params.n_keep;
-    if (n_keep < 0 || n_keep > (int)embd_inp.size() || params.instruct) {
-        n_keep = (int)embd_inp.size();
-    }
-    if (!n_keep && !params.n_keep_str.empty()) {
-        auto pivot = ::llama_tokenize(ctx, params.n_keep_str, false);
-        auto pos = std::search(embd_inp.begin(), embd_inp.end(),
-                               pivot.begin(), pivot.end());
-        if (pos == embd_inp.end()) {
-            fprintf(stderr, "%s: error: --n_keep %`'s substring not found within prompt\n",
-                    __func__, params.n_keep_str.c_str());
-            return 1;
-        }
-        n_keep = (pos - embd_inp.begin()) + (pivot.end() - pivot.begin());
-    }
-
-    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
-
-    // in instruct mode, we inject a prefix and a suffix to each input by the user
-    if (params.instruct) {
-        params.interactive_first = true;
-        params.antiprompt.push_back("### Instruction:\n\n");
-    }
-
-    // enable interactive mode if interactive start is specified
-    if (params.interactive_first) {
-        params.interactive = true;
-    }
-
-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
-    if (params.verbose_prompt) {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d %6d -> %`'s\n", i, embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
-        }
-        fprintf(stderr, "%s: first part of prompt: \"", __func__);
-        for (int i = 0; i < n_keep; i++) {
-            fprintf(stderr, "%'s", llama_token_to_str(ctx, embd_inp[i]));
-        }
-        fprintf(stderr, "\"\n");
-        fprintf(stderr, "%s: second part of prompt: \"", __func__);
-        for (int i = n_keep; i < (int)embd_inp.size(); i++) {
-            fprintf(stderr, "%'s", llama_token_to_str(ctx, embd_inp[i]));
-        }
-        fprintf(stderr, "\"\n");
-        fprintf(stderr, "\n");
-    }
-
-    // setup ctrl-c handler
-    struct sigaction sa;
-    sa.sa_flags = 0;
-    sigemptyset(&sa.sa_mask);
-    if (params.interactive) {
-        sa.sa_handler = sigint_handler_interactive;
-    } else {
-        sa.sa_handler = sigint_handler_batch;
-    }
-    sigaction(SIGINT, &sa, NULL);
-
-    if (params.interactive) {
-        if (params.verbose > 0) {
-            fprintf(stderr, "%s: interactive mode on.\n", __func__);
-        }
-
-        if (params.verbose > 0 && params.antiprompt.size()) {
-            for (auto antiprompt : params.antiprompt) {
-                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
-            }
-        }
-
-        if (params.verbose > 0 && !params.input_prefix.empty()) {
-            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
-        }
-    }
-
-    if (params.verbose > 0) {
-        fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
-                params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-        fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n",
-                n_ctx, params.n_batch, params.n_predict, n_keep);
-        fprintf(stderr, "\n\n");
-    }
-
-    if (params.verbose > 0 && params.interactive) {
-        fprintf(stderr, "== Running in interactive mode. ==\n"
-               " - Press Ctrl+C to interject at any time.\n"
-               " - Press Return to return control to LLaMa.\n"
-               " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_first;
-    }
-
-    remember_init();
-
-    input_noecho = params.verbose <= 0;
-
-    n_past     = 0;
-    n_remain   = params.n_predict;
-    n_consumed = 0;
-
-    // instantly reload prompt if it's cached
-    int fd = open(params.prompt_path.c_str(), O_RDONLY);
-    if (fd != -1) {
-        size_t state_size;
-        size_t prompt_size;
-        struct timespec mtim;
-        struct jtlp_header *header;
-        off_t rc = lseek(fd, 0, SEEK_END);
-        LLAMA_ASSERT(rc != -1);
-        void *map = MAP_FAILED;
-        size_t file_size = rc;
-        if (file_size < sizeof(header)) {
-            fprintf(stderr, "%s: prompt file too small\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        map = mmap(0, file_size, PROT_READ, MAP_SHARED, fd, 0);
-        if (map == MAP_FAILED) {
-            fprintf(stderr, "%s: mmap failed: %s\n",
-                    params.prompt_path.c_str(), strerror(errno));
-            goto CantReloadPrompt;
-        }
-        header = (struct jtlp_header *)map;
-        // check file format magic
-        if (READ32LE(header->magic) != kJtlpMagic) {
-            fprintf(stderr, "%s: prompt file has wrong magic\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        // check file format version
-        if (READ32LE(header->version) > kJtlpVersion) {
-            fprintf(stderr, "%s: prompt has future file format version\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        // check expected state size
-        state_size = llama_get_state_size(ctx);
-        if (READ64LE(header->state_size) != state_size) {
-            if (params.verbose > 0) {
-                fprintf(stderr, "%s: prompt has stale data state size\n",
-                        params.prompt_path.c_str());
-            }
-            goto CantReloadPrompt;
-        }
-        // check model device id
-        if (READ64LE(header->model_dev) != model_stat.st_dev) {
-            fprintf(stderr, "%s: prompt is for different model (dev)\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        // check model inode id
-        if (READ64LE(header->model_ino) != model_stat.st_ino) {
-            fprintf(stderr, "%s: prompt is for different model (ino)\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        // check model modified timestamp
-        mtim.tv_sec = READ64LE(header->model_mtim_sec);
-        mtim.tv_nsec = READ64LE(header->model_mtim_nsec);
-        if (CompareTime(model_stat.st_mtim, mtim) > 0) {
-            if (params.verbose > 0) {
-                fprintf(stderr, "%s: model file timestamp changed; will reload and regenerate prompt\n",
-                        params.prompt_path.c_str());
-            }
-            goto CantReloadPrompt;
-        }
-        // check prompt file size
-        prompt_size = READ64LE(header->prompt_size);
-        if (sizeof(struct jtlp_header) + prompt_size + state_size > file_size) {
-            fprintf(stderr, "%s: prompt file size unexpected\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        // check prompt textus
-        if (prompt_size != params.prompt.size() ||
-            memcmp(header + 1, params.prompt.c_str(), prompt_size) != 0) {
-            if (params.verbose > 0) {
-                fprintf(stderr, "%s: prompt text changed; will reload and regenerate\n",
-                        params.prompt_path.c_str());
-            }
-            goto CantReloadPrompt;
-        }
-        // read the transformer state
-        llama_set_state_data(ctx, (uint8_t *)(header + 1) + prompt_size);
-        // we're finished loading the prompt file
-        if (params.verbose > 0) {
-            fprintf(stderr, "%s: %s: reloaded previously saved prompt\n",
-                    __func__, params.prompt_path.c_str());
-        }
-        // now setup the business logic
-        llama_set_rng_seed(ctx, params.seed);
-        while ((int) embd_inp.size() > n_consumed) {
-            remember_token(embd_inp[n_consumed++]);
-        }
-        n_past = n_consumed;
-        finish_initializing_prompt();
-  CantReloadPrompt:
-        if (map != MAP_FAILED) {
-            munmap(map, file_size);
-        }
-        close(fd);
-    }
-
-    if (prompt_status == kPromptPending && params.verbose > 0) {
-        // the first thing we will do is to output the prompt, so set color accordingly
-        console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-    }
-
-    std::vector<llama_token> embd;
-
-    if (prompt_status == kPromptPending &&
-        !params.verbose && con_st.use_color) {
-        fprintf(stderr, EPHEMERAL("loading weights..."));
-    }
-
-    // tracks if last character written to stdout was newline
-    bool got_newline = false;
-
-    while ((n_remain != 0 || params.interactive) && !is_terminated) {
-
-        // perform evaluation
-        if (embd.size() > 0) {
-            DEVLOG("perform evaluation embd.size()=%d\n", (int)embd.size());
-            if (n_past + (int) embd.size() > n_ctx) {
-                n_past = n_keep;
-                embd.insert(embd.begin(),
-                            last_n_tokens.end() - (n_past - n_keep) / 2 - embd.size(),
-                            last_n_tokens.end() - embd.size());
-            }
-            for (int i = 0; i < (int) embd.size() && !is_terminated; i += params.n_batch) {
-                int n_eval = (int) embd.size() - i;
-                if (n_eval > params.n_batch) {
-                    n_eval = params.n_batch;
-                }
-                is_stalled = n_eval > 1;
-                DEVLOG("llama_eval(n_evel=%d, n_past=%d)\n", n_eval, n_past);
-                if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
-                    fprintf(stderr, "%s : failed to eval\n", __func__);
-                    console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-                    return 1;
-                }
-                is_stalled = false;
-                n_past += n_eval;
-                if (prompt_status == kPromptPending &&
-                    !params.verbose && con_st.use_color && embd_inp.size()) {
-                    fprintf(stderr, EPHEMERAL("loading prompt %d%% ..."),
-                            (int)(n_consumed / (double)embd_inp.size() * 100));
-                }
-            }
-            if (is_terminated) {
-                break;
-            }
-            embd.clear();
-        }
-
-        // save prompt to disk atomically as soon as it's finished loading
-        bool just_finished_initializing_prompt = prompt_status == kPromptCompleted;
-        if (just_finished_initializing_prompt && !params.prompt_path.empty()) {
-            int fd = -1;
-            int close_rc;
-            size_t file_size;
-            size_t state_size;
-            std::string tmppath;
-            void *map = MAP_FAILED;
-            struct jtlp_header header;
-            if (!params.verbose && con_st.use_color) {
-                fprintf(stderr, EPHEMERAL("caching prompt..."));
-            }
-            state_size = llama_get_state_size(ctx);
-            WRITE32LE(header.magic, kJtlpMagic);
-            WRITE32LE(header.version, kJtlpVersion);
-            WRITE64LE(header.state_size, state_size);
-            WRITE64LE(header.model_dev, model_stat.st_dev);
-            WRITE64LE(header.model_ino, model_stat.st_ino);
-            WRITE64LE(header.model_mtim_sec, model_stat.st_mtim.tv_sec);
-            WRITE64LE(header.model_mtim_nsec, model_stat.st_mtim.tv_nsec);
-            WRITE64LE(header.prompt_size, params.prompt.size());
-            file_size = sizeof(header) + params.prompt.size() + state_size;
-            tmppath.append(params.prompt_path);
-            tmppath.append(".XXXXXX");
-            fd = mkstemp(&tmppath[0]);
-            if (fd == -1) {
-                fprintf(stderr, "%s: mkstemp failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-                goto CouldNotSavePrompt;
-            }
-            if (ftruncate(fd, file_size)) {
-                fprintf(stderr, "%s: ftruncate failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-                goto CouldNotSavePrompt;
-            }
-            map = mmap(0, file_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-            if (map == MAP_FAILED) {
-                fprintf(stderr, "%s: mmap failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-                goto CouldNotSavePrompt;
-            }
-            llama_copy_state_data(ctx, (uint8_t *)map + sizeof(header) + params.prompt.size());
-            memcpy((uint8_t *)map + sizeof(header), params.prompt.c_str(), params.prompt.size());
-            memcpy(map, &header, sizeof(header));
-            if (msync(map, file_size, MS_ASYNC) && params.verbose > 0) {
-                fprintf(stderr, "%s: msync failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-            }
-            if (munmap(map, file_size) && params.verbose > 0) {
-                fprintf(stderr, "%s: munmap failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-            }
-            map = MAP_FAILED;
-            close_rc = close(fd);
-            fd = -1;
-            if (close_rc) {
-                fprintf(stderr, "%s: close failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-                goto CouldNotSavePrompt;
-            }
-            if (rename(tmppath.c_str(), params.prompt_path.c_str())) {
-                fprintf(stderr, "%s -> %s: rename failed: %s\n",
-                        tmppath.c_str(), params.prompt_path.c_str(), strerror(errno));
-                goto CouldNotSavePrompt;
-            }
-            tmppath.clear();
-      CouldNotSavePrompt:
-            if (map != MAP_FAILED) munmap(map, file_size);
-            if (fd != -1) close(fd);
-            if (!tmppath.empty()) unlink(tmppath.c_str());
-        }
-        if (just_finished_initializing_prompt) {
-            if (!params.verbose && con_st.use_color) {
-                fprintf(stderr, EPHEMERAL(""));
-            }
-            finish_initializing_prompt();
-        }
-
-        if (prompt_status == kPromptFinished &&
-            (int) embd_inp.size() <= n_consumed && !is_interacting) {
-            // out of user input, sample next token
-            DEVLOG("out of user input, sample next token w/ embd_inp.size()=%d n_consumed=%d\n",
-                   (int)embd_inp.size(), n_consumed);
-            const float   temp            = params.temp;
-            const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
-            const float   top_p           = params.top_p;
-            const float   tfs_z           = params.tfs_z;
-            const float   typical_p       = params.typical_p;
-            const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-            const float   repeat_penalty  = params.repeat_penalty;
-            const float   alpha_presence  = params.presence_penalty;
-            const float   alpha_frequency = params.frequency_penalty;
-            const int     mirostat        = params.mirostat;
-            const float   mirostat_tau    = params.mirostat_tau;
-            const float   mirostat_eta    = params.mirostat_eta;
-            const bool    penalize_nl     = params.penalize_nl;
-
-            llama_token id = 0;
-
-            {
-                auto logits  = llama_get_logits(ctx);
-                auto n_vocab = llama_n_vocab(ctx);
-
-                // Apply params.logit_bias map
-                for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-                    logits[it->first] += it->second;
-                }
-
-                std::vector<llama_token_data> candidates;
-                candidates.reserve(n_vocab);
-                for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                    candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-                }
-
-                llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-                // Apply penalties
-                float nl_logit = logits[llama_token_nl()];
-                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-                llama_sample_repetition_penalty(ctx, &candidates_p,
-                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                    last_n_repeat, repeat_penalty);
-                llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                    last_n_repeat, alpha_frequency, alpha_presence);
-                if (!penalize_nl) {
-                    logits[llama_token_nl()] = nl_logit;
-                }
-
-                if (temp <= 0) {
-                    // Greedy sampling
-                    id = llama_sample_token_greedy(ctx, &candidates_p);
-                } else {
-                    if (mirostat == 1) {
-                        static float mirostat_mu = 2.0f * mirostat_tau;
-                        const int mirostat_m = 100;
-                        llama_sample_temperature(ctx, &candidates_p, temp);
-                        id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-                    } else if (mirostat == 2) {
-                        static float mirostat_mu = 2.0f * mirostat_tau;
-                        llama_sample_temperature(ctx, &candidates_p, temp);
-                        id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-                    } else {
-                        // Temperature sampling
-                        llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-                        llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
-                        llama_sample_typical(ctx, &candidates_p, typical_p, 1);
-                        llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-                        llama_sample_temperature(ctx, &candidates_p, temp);
-                        id = llama_sample_token(ctx, &candidates_p);
-                    }
-                }
-
-                remember_token(id);
-            }
-
-            // replace end of text token with newline token when in interactive mode
-            if (id == llama_token_eos() && params.interactive && !params.instruct) {
-                id = llama_token_newline.front();
-                if (params.antiprompt.size() != 0) {
-                    // tokenize and inject first reverse prompt
-                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
-                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
-                }
-            }
-
-            // add it to the context
-            embd.push_back(id);
-
-            // echo this to console
-            input_noecho = false;
-
-            // decrement remaining sampling budget
-            --n_remain;
-
-        } else {
-            DEVLOG("some user input remains from prompt or interaction w/ embd_inp.size()=%d n_consumed=%d\n",
-                   (int)embd_inp.size(), n_consumed);
-            // some user input remains from prompt or interaction, forward it to processing
-            while ((int) embd_inp.size() > n_consumed) {
-                embd.push_back(embd_inp[n_consumed]);
-                remember_token(embd_inp[n_consumed++], true);
-                if ((int) embd.size() >= params.n_batch) {
-                    break;
-                }
-            }
-
-            // we've nearly finished loading the prompt
-            if (prompt_status == kPromptPending &&
-                (int) embd_inp.size() <= n_consumed) {
-                prompt_status = kPromptCompleted;
-            }
-        }
-
-        // checks for reverse prompt
-        //
-        // 1. in interactive mode, this lets us detect when the llm is
-        //    prompting the user, so we can pause for input, e.g.
-        //
-        //       --interactive
-        //       --prompt $'CompanionAI: How can I help you?\nHuman:'
-        //       --reverse-prompt 'Human:'
-        //
-        // 2. in normal mode, the reverse prompt can be used to specify
-        //    a custom EOS token, e.g.
-        //
-        //       --prompt 'Question: How old are you?\nAnswer: '
-        //       --reverse-prompt $'\n'
-        //
-        bool is_antiprompt;
-        std::string ap_text;
-        std::string::size_type ap_extra;
-        std::string::size_type ap_index;
-        if (prompt_status == kPromptFinished) {
-            is_antiprompt = has_antiprompt(&ap_index, &ap_text);
-        } else {
-            is_antiprompt = false;
-        }
-
-        // display text
-        if (!input_noecho && embd.size()) {
-            std::string printme;
-            for (auto id : embd) {
-                printme.append(llama_token_to_str(ctx, id));
-            }
-            if (is_antiprompt) {
-                ap_extra = last_output.size() - ap_index;
-                printme.erase(std::max(0, (int)(printme.size() - ap_extra)));
-            }
-            if (printme.size()) {
-                got_newline = printme[printme.size() - 1] == '\n';
-                printf("%s", printme.c_str());
-                fflush(stdout);
-            }
-        }
-        if (is_antiprompt) {
-            if (!params.interactive) {
-                DEVLOG("exiting due to antiprompt\n");
-                if (!got_newline) {
-                    printf("\n");
-                }
-                break;
-            }
-            // scrub antiprompt so to detect it must be typed again
-            last_output.erase(0, ap_index + ap_text.size());
-            DEVLOG("scrubbed antiprompt -> %`'s\n", last_output.c_str());
-        }
-        if (prompt_status == kPromptCompleted) {
-            DEVLOG("avoid reading line before last token loads\n");
-            continue;  // avoid reading line before last token loads
-        }
-
-        // reset color to default if we there is no pending user input
-        if (params.verbose && !input_noecho && (int)embd_inp.size() == n_consumed) {
-            console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-        }
-
-        if (is_antiprompt) {
-            is_interacting = true;
-            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-            fflush(stdout);
-        }
-
-        // in interactive mode, and not currently processing queued inputs;
-        // check if we should prompt the user for more
-        if (params.interactive && (int) embd_inp.size() <= n_consumed) {
-
-            if (n_past > 0 && is_interacting) {
-
-                // potentially set color to indicate we are taking user input
-                console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-
-                if (params.instruct) {
-                    printf("\n> ");
-                }
-
-                std::string buffer;
-                if (!params.input_prefix.empty()) {
-                    buffer += params.input_prefix;
-                    printf("%s", buffer.c_str());
-                }
-
-                // display a "waiting for input" indicator, just in case
-                // the model doesn't halt on the antiprompt.
-                if (con_st.use_color) {
-                    fprintf(stdout, "?\b");
-                    fflush(stdout);
-                }
-
-                std::string line;
-                bool another_line = true;
-                do {
-                    another_line = console_readline(con_st, line);
-                    buffer += line;
-                } while (another_line);
-
-                // done taking input, reset color
-                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-                // Add tokens to embd only if the input buffer is non-empty
-                // Entering a empty line lets the user pass control back
-                if (buffer.length() > 1) {
-                    // append input suffix if any
-                    if (!params.input_suffix.empty()) {
-                        buffer += params.input_suffix;
-                        printf("%s", params.input_suffix.c_str());
-                    }
-
-                    // instruct mode: insert instruction prefix
-                    if (params.instruct && !is_antiprompt) {
-                        n_consumed = embd_inp.size();
-                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
-                    }
-
-                    auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
-
-                    // instruct mode: insert response suffix
-                    if (params.instruct) {
-                        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                    }
-
-                    n_remain -= line_inp.size();
-                }
-
-                input_noecho = true; // do not echo this again
-            }
-
-            if (n_past > 0) {
-                is_interacting = false;
-            }
-            assert(!is_interacting);
-        }
-
-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
-            if (params.instruct) {
-                is_interacting = true;
-            } else if (params.verbose > 0) {
-                fprintf(stderr, " [end of text]\n");
-                break;
-            }
-        }
-
-        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
-        if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
-            n_remain = params.n_predict;
-            is_interacting = true;
-        }
-    }
-
-    if (is_terminated) {
-        console_cleanup(con_st);
-        printf("\n");
-        if (params.verbose > 0) {
-            llama_print_timings(ctx);
-        }
-        _exit(128 + SIGINT);
-    }
-
-    if (params.verbose > 0) {
-        llama_print_timings(ctx);
-    }
-    llama_free(ctx);
-
-    console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-    return 0;
-}
--- a/third_party/ggml/perplexity.cc
+++ b/third_party/ggml/perplexity.cc
@ -1,196 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                             :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  llama.com                                                                   │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/common.h"
-#include "third_party/ggml/llama.h"
-#include "third_party/libcxx/vector"
-
-asm(".ident\t\"\\n\\n\
-llama.cpp (MIT License)\\n\
-Copyright (c) 2023 Georgi Gerganov\"");
-asm(".include \"libc/disclaimer.inc\"");
-
-std::vector<float> softmax(const std::vector<float>& logits) {
-    std::vector<float> probs(logits.size());
-    float max_logit = logits[0];
-    for (float v : logits) max_logit = std::max(max_logit, v);
-    double sum_exp = 0.0;
-    for (size_t i = 0; i < logits.size(); i++) {
-        // Subtract the maximum logit value from the current logit value for numerical stability
-        const float logit = logits[i] - max_logit;
-        const float exp_logit = expf(logit);
-        sum_exp += exp_logit;
-        probs[i] = exp_logit;
-    }
-    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
-    return probs;
-}
-
-void perplexity(llama_context * ctx, const gpt_params & params) {
-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
-    // Output: `perplexity: 13.5106 [114/114]`
-    // BOS tokens will be added for each chunk before eval
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
-
-    int count   = 0;
-
-    const int n_chunk = tokens.size() / params.n_ctx;
-    const int n_vocab = llama_n_vocab(ctx);
-    const int n_batch = params.n_batch;
-
-    double nll = 0.0;
-    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
-
-    for (int i = 0; i < n_chunk; ++i) {
-        const int start =     i * params.n_ctx;
-        const int end   = start + params.n_ctx;
-
-        const int num_batches = (params.n_ctx + n_batch - 1) / n_batch;
-
-        std::vector<float> logits;
-
-        const auto t_start = std::chrono::high_resolution_clock::now();
-
-        for (int j = 0; j < num_batches; ++j) {
-            const int batch_start = start + j * n_batch;
-            const int batch_size  = std::min(end - batch_start, n_batch);
-
-            // save original token and restore it after eval
-            const auto token_org = tokens[batch_start];
-
-            // add BOS token for the first batch of each chunk
-            if (j == 0) {
-                tokens[batch_start] = llama_token_bos();
-            }
-
-            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return;
-            }
-
-            // restore the original token in case it was set to BOS
-            tokens[batch_start] = token_org;
-
-            const auto batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
-        }
-
-        const auto t_end = std::chrono::high_resolution_clock::now();
-
-        if (i == 0) {
-            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
-            int total_seconds = (int)(t_total * n_chunk);
-            if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
-                total_seconds = total_seconds % (60*60);
-            }
-            fprintf(stderr, "%d minutes\n", total_seconds / 60);
-        }
-
-        // We get the logits for all the tokens in the context window (params.n_ctx)
-        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
-        // calculate the perplexity over the last half of the window (so the model always has
-        // some context to predict the token).
-        //
-        // We rely on the fact that attention in the forward pass only looks at previous
-        // tokens here, so the logits returned for each token are an accurate representation
-        // of what the model would have predicted at that point.
-        //
-        // Example, we have a context window of 512, we will compute perplexity for each of the
-        // last 256 tokens.  Then, we split the input up into context window size chunks to
-        // process the entire prompt.
-        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
-            // Calculate probability of next token, given the previous ones.
-            const std::vector<float> tok_logits(
-                logits.begin() + (j + 0) * n_vocab,
-                logits.begin() + (j + 1) * n_vocab);
-
-            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
-
-            nll += -std::log(prob);
-            ++count;
-        }
-        // perplexity is e^(average negative log-likelihood)
-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        fflush(stdout);
-    }
-    printf("\n");
-}
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-    params.model = "models/llama-7B/ggml-model.bin";
-
-    params.n_batch = 512;
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    params.perplexity = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
-
-    if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    llama_context * ctx;
-
-    // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return 1;
-    }
-
-    // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
-    }
-
-    perplexity(ctx, params);
-
-    llama_print_timings(ctx);
-    llama_free(ctx);
-
-    return 0;
-}
--- a/third_party/ggml/quantize.cc
+++ b/third_party/ggml/quantize.cc
@ -1,122 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                             :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  llama.com                                                                   │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/log/log.h"
-#include "libc/runtime/runtime.h"
-#include "third_party/ggml/common.h"
-#include "third_party/ggml/ggml.h"
-#include "third_party/ggml/llama.h"
-#include "third_party/ggml/llama_util.h"
-#include "third_party/libcxx/map"
-#include "third_party/libcxx/vector"
-
-asm(".ident\t\"\\n\\n\
-llama.cpp (MIT License)\\n\
-Copyright (c) 2023 Georgi Gerganov\"");
-asm(".include \"libc/disclaimer.inc\"");
-
-static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
-    {"f32",  LLAMA_FTYPE_ALL_F32 },
-    {"f16",  LLAMA_FTYPE_MOSTLY_F16 },
-    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
-    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
-    {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
-    {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
-    {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
-    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
-};
-
-// usage:
-//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type [nthreads]
-//
-int main(int argc, char ** argv) {
-    verynice();
-    ShowCrashReports();
-
-    ggjt_v3();
-    ggml_time_init();
-
-    if (argc < 3) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthreads]\n", argv[0]);
-        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
-            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
-        }
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    llama_init_backend();
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    if (fname_inp == fname_out) {
-        fprintf(stderr, "%s: input and output names are same\n", fname_inp.c_str());
-        exit(1);
-    }
-
-    enum llama_ftype ftype;
-    if (!is_integer_str(argv[3])) {
-        auto it = LLAMA_FTYPE_MAP.find(argv[3]);
-        if (it == LLAMA_FTYPE_MAP.end()) {
-            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
-            return 1;
-        }
-        ftype = it->second;
-    } else {
-        ftype = (enum llama_ftype)atoi(argv[3]);
-    }
-
-    int nthread = argc > 4 ? atoi(argv[4]) : std::min(20, std::max(1, __get_cpu_count() >> 1));
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
-    }
-
-    return 0;
-}
--- a/third_party/radpajama/BUILD.mk
+++ b/third_party/radpajama/BUILD.mk
@ -1,145 +0,0 @@
-#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
-#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘
-
-PKGS += THIRD_PARTY_RADPAJAMA
-
-################################################################################
-# redpajama library code common to both executables below
-
-THIRD_PARTY_RADPAJAMA_ARTIFACTS += THIRD_PARTY_RADPAJAMA_A
-THIRD_PARTY_RADPAJAMA = $(THIRD_PARTY_RADPAJAMA_A_DEPS) $(THIRD_PARTY_RADPAJAMA_A)
-THIRD_PARTY_RADPAJAMA_A = o/$(MODE)/third_party/radpajama/radpajama.a
-THIRD_PARTY_RADPAJAMA_A_OBJS = $(THIRD_PARTY_RADPAJAMA_A_SRCS:%.cc=o/$(MODE)/%.o)
-THIRD_PARTY_RADPAJAMA_A_FILES = $(THIRD_PARTY_RADPAJAMA_A_SRCS) $(THIRD_PARTY_RADPAJAMA_A_HDRS)
-THIRD_PARTY_RADPAJAMA_A_CHECKS = $(THIRD_PARTY_RADPAJAMA_A).pkg $(THIRD_PARTY_RADPAJAMA_A_HDRS:%=o/$(MODE)/%.okk)
-
-THIRD_PARTY_RADPAJAMA_A_HDRS =						\
-	third_party/radpajama/common-gptneox.h				\
-	third_party/radpajama/gptneox-util.h				\
-	third_party/radpajama/gptneox.h
-
-THIRD_PARTY_RADPAJAMA_A_SRCS =						\
-	third_party/radpajama/common-gptneox.cc				\
-	third_party/radpajama/gptneox.cc				\
-
-THIRD_PARTY_RADPAJAMA_A_DIRECTDEPS =					\
-	LIBC_CALLS							\
-	LIBC_FMT							\
-	LIBC_INTRIN							\
-	LIBC_MEM							\
-	LIBC_NEXGEN32E							\
-	LIBC_RUNTIME							\
-	LIBC_STDIO							\
-	LIBC_STR							\
-	LIBC_SYSV							\
-	LIBC_THREAD							\
-	LIBC_TINYMATH							\
-	THIRD_PARTY_COMPILER_RT						\
-	THIRD_PARTY_GGML						\
-	THIRD_PARTY_LIBCXX
-
-THIRD_PARTY_RADPAJAMA_A_DEPS :=						\
-	$(call uniq,$(foreach x,$(THIRD_PARTY_RADPAJAMA_A_DIRECTDEPS),$($(x))))
-
-$(THIRD_PARTY_RADPAJAMA_A):						\
-		third_party/radpajama/					\
-		$(THIRD_PARTY_RADPAJAMA_A).pkg				\
-		$(THIRD_PARTY_RADPAJAMA_A_OBJS)
-
-$(THIRD_PARTY_RADPAJAMA_A).pkg:						\
-		$(THIRD_PARTY_RADPAJAMA_A_OBJS)				\
-		$(foreach x,$(THIRD_PARTY_RADPAJAMA_A_DIRECTDEPS),$($(x)_A).pkg)
-
-################################################################################
-# two executable programs for running inference on redpajama models
-#
-#     make -j8 o//third_party/radpajama/radpajama.com
-#     make -j8 o//third_party/radpajama/radpajama-chat.com
-#     make -j8 o//third_party/radpajama/radpajama-copy.com
-#     make -j8 o//third_party/radpajama/radpajama-quantize.com
-
-THIRD_PARTY_RADPAJAMA_ARTIFACTS += THIRD_PARTY_RADPAJAMA_MAIN
-THIRD_PARTY_RADPAJAMA_MAIN_OBJS = $(THIRD_PARTY_RADPAJAMA_MAIN_SRCS:%.cc=o/$(MODE)/%.o)
-THIRD_PARTY_RADPAJAMA_MAIN_BINS = $(THIRD_PARTY_RADPAJAMA_COMS) $(THIRD_PARTY_RADPAJAMA_COMS:%=%.dbg)
-
-THIRD_PARTY_RADPAJAMA_MAIN_COMS =					\
-	o/$(MODE)/third_party/radpajama/radpajama.com			\
-	o/$(MODE)/third_party/radpajama/radpajama-chat.com		\
-	o/$(MODE)/third_party/radpajama/radpajama-copy.com		\
-	o/$(MODE)/third_party/radpajama/radpajama-quantize.com
-
-THIRD_PARTY_RADPAJAMA_MAIN_SRCS =					\
-	third_party/radpajama/main-redpajama.cc				\
-	third_party/radpajama/main-redpajama-chat.cc			\
-	third_party/radpajama/copy-gptneox.cc				\
-	third_party/radpajama/quantize-gptneox.cc
-
-THIRD_PARTY_RADPAJAMA_MAIN_DIRECTDEPS =					\
-	LIBC_CALLS							\
-	LIBC_FMT							\
-	LIBC_INTRIN							\
-	LIBC_LOG							\
-	LIBC_NEXGEN32E							\
-	LIBC_RUNTIME							\
-        LIBC_PROC							\
-	LIBC_STDIO							\
-	LIBC_STR							\
-	THIRD_PARTY_GGML						\
-	THIRD_PARTY_RADPAJAMA						\
-	THIRD_PARTY_LIBCXX
-
-THIRD_PARTY_RADPAJAMA_MAIN_DEPS :=					\
-	$(call uniq,$(foreach x,$(THIRD_PARTY_RADPAJAMA_MAIN_DIRECTDEPS),$($(x))))
-
-o/$(MODE)/third_party/radpajama/main.pkg:				\
-		$(THIRD_PARTY_RADPAJAMA_MAIN_OBJS)			\
-		$(foreach x,$(THIRD_PARTY_RADPAJAMA_MAIN_DIRECTDEPS),$($(x)_A).pkg)
-
-o/$(MODE)/third_party/radpajama/radpajama.com.dbg:			\
-		o/$(MODE)/third_party/radpajama/main.pkg		\
-		$(THIRD_PARTY_RADPAJAMA_MAIN_DEPS)			\
-		o/$(MODE)/third_party/radpajama/main-redpajama.o	\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/third_party/radpajama/radpajama-chat.com.dbg:			\
-		o/$(MODE)/third_party/radpajama/main.pkg		\
-		$(THIRD_PARTY_RADPAJAMA_MAIN_DEPS)			\
-		o/$(MODE)/third_party/radpajama/main-redpajama-chat.o	\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/third_party/radpajama/radpajama-copy.com.dbg:			\
-		o/$(MODE)/third_party/radpajama/main.pkg		\
-		$(THIRD_PARTY_RADPAJAMA_MAIN_DEPS)			\
-		o/$(MODE)/third_party/radpajama/copy-gptneox.o		\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-o/$(MODE)/third_party/radpajama/radpajama-quantize.com.dbg:		\
-		o/$(MODE)/third_party/radpajama/main.pkg		\
-		$(THIRD_PARTY_RADPAJAMA_MAIN_DEPS)			\
-		o/$(MODE)/third_party/radpajama/quantize-gptneox.o	\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-################################################################################
-# package level definitions
-
-THIRD_PARTY_RADPAJAMA_LIBS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)))
-THIRD_PARTY_RADPAJAMA_COMS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_COMS))
-THIRD_PARTY_RADPAJAMA_BINS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_BINS))
-THIRD_PARTY_RADPAJAMA_SRCS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_SRCS))
-THIRD_PARTY_RADPAJAMA_HDRS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_HDRS))
-THIRD_PARTY_RADPAJAMA_OBJS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_OBJS))
-THIRD_PARTY_RADPAJAMA_CHECKS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_CHECKS))
-$(THIRD_PARTY_RADPAJAMA_OBJS): third_party/radpajama/BUILD.mk
-
-.PHONY: o/$(MODE)/third_party/radpajama
-o/$(MODE)/third_party/radpajama:					\
-		$(THIRD_PARTY_RADPAJAMA_BINS)				\
-		$(THIRD_PARTY_RADPAJAMA_CHECKS)
--- a/third_party/radpajama/LICENSE
+++ b/third_party/radpajama/LICENSE
@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2023 Georgi Gerganov
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
--- a/third_party/radpajama/README.cosmo
+++ b/third_party/radpajama/README.cosmo
@ -1,19 +0,0 @@
-DESCRIPTION
-
-  radpajama is a port of ggml for the open source Red Pajama LLM. It started as a fork of redpajama.cpp from Together Computer.
-
-LICENSE
-
-  MIT
-
-ORIGIN
-
-  github.com/togethercomputer/redpajama.cpp/
-  commit bfa6466199b8ef92185ecb72e2a550e12baf6602
-  Author: Szhangce <czhang@cs.stanford.edu>
-  Date:   Tue May 9 00:50:22 2023 +0200
-  radpajama : Update README.md 
-
-LOCAL CHANGES
-
-  - Updated headers for COSMO build.
--- a/third_party/radpajama/README.md
+++ b/third_party/radpajama/README.md
@ -1,143 +0,0 @@
-# gglm Support for RedPajama Model
-
-## Ackonwledgement 
-
-We highly appreciate the great effort from the fork of [gptneox.cpp](https://github.com/byroneverson/gptneox.cpp). Our support of the RedPajama Model is mainly based on this implementation. We extend the model configure and fixed a bug when setting use_parallel_residual flag to False in their original implementation. We also extend the chat model for RedPajama.
-
-## Usage:
-
-### RedPajama Chat model:
-
- Make the code:
-
-        make redpajama-chat quantize-gptneox
-
-
- Prepare the RedPajama model (f16 and q4_0) for gglm:
-
-        bash ./examples/redpajama/scripts/install-RedPajama-INCITE-Chat-3B-v1.sh
-
- Run RedPajama chat model (fp16):
-
-        ./redpajama-chat -m ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat-3B-v1-f16.bin \
-        -c 2048 \
-        -b 128 \
-        -n 1 \
-        -t 8 \
-        --instruct \
-        --color \
-        --top_k 30 \
-        --top_p 0.95 \
-        --temp 0.8 \
-        --repeat_last_n 3 \
-        --repeat_penalty 1.1 \
-        --seed 0
-
-    Note that you may need to install torch and transformers to run the above scripts, e.g.:
-        
-        pip install torch==2.0.0
-        pip install transformers==4.28.1
-
-
- Run RedPajama chat model (q4_0):
-
-        ./redpajama-chat -m ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat-3B-v1-q4_0.bin \
-        -c 2048 \
-        -b 128 \
-        -n 1 \
-        -t 8 \
-        --instruct \
-        --color \
-        --top_k 30 \
-        --top_p 0.95 \
-        --temp 0.8 \
-        --repeat_last_n 3 \
-        --repeat_penalty 1.1 \
-        --seed 0
-
- Run other quantized version of RedPajama Chat model (Make sure you get the f16 model prepared before you run this):
-
-  - Make the code to quantize the model if you have not:
-
-        make quantize-gptneox
-
-  - Generate the quantized model, the supported types include: q4_0, q4_1, q4_2, q5_0, q5_1, and q8_0. For example, to run q4_1, you need to do the following convertion:
-
-        python ./examples/redpajama/scripts/quantize-gptneox.py ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat-3B-v1-f16.bin --quantize-output-type q4_1
-
-  - Then you can chat with the quantized model:
-
-        ./redpajama-chat -m ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat-3B-v1-q4_1.bin \
-        -c 2048 \
-        -b 128 \
-        -n 1 \
-        -t 8 \
-        --instruct \
-        --color \
-        --top_k 30 \
-        --top_p 0.95 \
-        --temp 0.8 \
-        --repeat_last_n 3 \
-        --repeat_penalty 1.1 \
-        --seed 0
-
-
-
-
-### RedPajama Base/Instruct model:
-
- Make the code:
-
-        make redpajama quantize-gptneox
-
-
- Prepare the RedPajama Base/Instruct model (f16 and q4_0) for gglm:
-
-        bash ./examples/redpajama/scripts/install-RedPajama-INCITE-Base-3B-v1.sh
-
-        # Or 
-
-        bash ./examples/redpajama/scripts/install-RedPajama-INCITE-Instruct-3B-v1.sh
-
- Run other quantize version of RedPajama Base/Instruct model (Make sure you get the f16 model prepared before you run this). Then you can generate the quantized model, the supported types include: q4_0, q4_1, q4_2, q5_0, q5_1, and q8_0. For example, to run q4_1, you need to do the following convertion, e.g for RedPajama-Base q8_0:
-
-        python ./examples/redpajama/scripts/quantize-gptneox.py ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Base-3B-v1-f16.bin --quantize-output-type q8_0
-
- Run RedPajama Base/Instruct model (e.g., RedPajama-Instruct q8_0) :
-
-        ./redpajama -m ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Instruct-3B-v1-q8_0.bin \
-        -c 2048 \
-        -b 128 \
-        -n 1 \
-        -t 8 \
-        --color \
-        --top_k 30 \
-        --top_p 0.95 \
-        --temp 0.8 \
-        --repeat_last_n 3 \
-        --repeat_penalty 1.1 \
-        --seed 0 \
-        --n_predict 256 \
-        --verbose-prompt \
-        -p "How to schedule a tour to Anfield:"
-
-
-## Attribution
-
-The following files are covered by a MIT license and were taken from:
-
-https://github.com/byroneverson/gptneox.cpp
-
-Thank you Byron.
-
-```
-common-gptneox.cpp	
-copy-gptneox.cpp	
-gptneox.cpp		
-quantize-gptneox.cpp
-common-gptneox.h	
-gptneox-util.h		
-gptneox.h
-convert_gptneox_to_ggml.py
-quantize-gptneox.py
-```
--- a/third_party/radpajama/common-gptneox.cc
+++ b/third_party/radpajama/common-gptneox.cc
@ -1,393 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                             :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  radpajama.com                                                               │
-│  Copyright (c) 2023 Ariel Núñez                                              │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/radpajama/common-gptneox.h"
-#include "third_party/ggml/llama_util.h"
-#include "third_party/libcxx/algorithm"
-#include "third_party/libcxx/cassert"
-#include "third_party/libcxx/cstring"
-#include "third_party/libcxx/fstream"
-#include "third_party/libcxx/iostream"
-#include "third_party/libcxx/iterator"
-#include "third_party/libcxx/sstream"
-#include "third_party/libcxx/string"
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    params.n_threads = std::min(20., (unsigned)__get_cpu_count() * 0.75);
-
-    bool invalid_param = false;
-    std::string arg;
-    gpt_params default_params;
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.seed = std::stoi(argv[i]);
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt = argv[i];
-        } else if (arg == "--session") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.path_session = argv[i];
-        } else if (arg == "-f" || arg == "--file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i]);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
-            if (params.prompt.back() == '\n') {
-                params.prompt.pop_back();
-            }
-        } else if (arg == "-n" || arg == "--n_predict") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "--top_k") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.top_k = std::stoi(argv[i]);
-        } else if (arg == "-c" || arg == "--ctx_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--memory_f32") {
-            params.memory_f16 = false;
-        } else if (arg == "--top_p") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.top_p = std::stof(argv[i]);
-        } else if (arg == "--temp") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.temp = std::stof(argv[i]);
-        } else if (arg == "--tfs") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.tfs_z = std::stof(argv[i]);
-        } else if (arg == "--typical") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.typical_p = std::stof(argv[i]);
-        } else if (arg == "--repeat_last_n") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.repeat_last_n = std::stoi(argv[i]);
-        } else if (arg == "--repeat_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.repeat_penalty = std::stof(argv[i]);
-        } else if (arg == "--frequency_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.frequency_penalty = std::stof(argv[i]);
-        } else if (arg == "--presence_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.presence_penalty = std::stof(argv[i]);
-        } else if (arg == "--mirostat") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.mirostat = std::stoi(argv[i]);
-        } else if (arg == "--mirostat_lr") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.mirostat_eta = std::stof(argv[i]);
-        } else if (arg == "--mirostat_ent") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.mirostat_tau = std::stof(argv[i]);
-        } else if (arg == "-b" || arg == "--batch_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
-        } else if (arg == "--keep") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_keep = std::stoi(argv[i]);
-        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model = argv[i];
-        } else if (arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter = argv[i];
-            params.use_mmap = false;
-        } else if (arg == "--lora-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        } else if (arg == "-i" || arg == "--interactive") {
-            params.interactive = true;
-        } else if (arg == "--embedding") {
-            params.embedding = true;
-        } else if (arg == "--interactive-first") {
-            params.interactive_first = true;
-        } else if (arg == "-ins" || arg == "--instruct") {
-            params.instruct = true;
-        } else if (arg == "--color") {
-            params.use_color = true;
-        } else if (arg == "--mlock") {
-            params.use_mlock = true;
-        } else if (arg == "--no-mmap") {
-            params.use_mmap = false;
-        } else if (arg == "--mtest") {
-            params.mem_test = true;
-        } else if (arg == "--verbose-prompt") {
-            params.verbose_prompt = true;
-        } else if (arg == "-r" || arg == "--reverse-prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.antiprompt.push_back(argv[i]);
-        } else if (arg == "--perplexity") {
-            params.perplexity = true;
-        } else if (arg == "--ignore-eos") {
-            params.logit_bias[gptneox_token_eos()] = -INFINITY;
-        } else if (arg == "--no-penalize-nl") {
-            params.penalize_nl = false;
-        } else if (arg == "-l" || arg == "--logit-bias") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::stringstream ss(argv[i]);
-            gptneox_token key = 0;
-            char sign = 0;
-            std::string value_str;
-            if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-            } else {
-                invalid_param = true;
-                break;
-            }
-        } else if (arg == "--n_parts") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_parts = std::stoi(argv[i]);
-        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, default_params);
-            exit(0);
-        } else if (arg == "--random-prompt") {
-            params.random_prompt = true;
-        } else if (arg == "--in-prefix") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.input_prefix = argv[i];
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, default_params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, default_params);
-        exit(1);
-    }
-
-    return true;
-}
-
-void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
-    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(stderr, "  -ins, --instruct      run in instruction mode\n");
-    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
-    fprintf(stderr, "                        specified more than once for multiple prompts).\n");
-    fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
-    fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
-    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
-    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(stderr, "  -f FNAME, --file FNAME\n");
-    fprintf(stderr, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(stderr, "  --top_k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    fprintf(stderr, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    fprintf(stderr, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stderr, "  --presence_penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(stderr, "  --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    fprintf(stderr, "  --mirostat N          use Mirostat sampling.\n");
-    fprintf(stderr, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    fprintf(stderr, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(stderr, "  --mirostat_lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(stderr, "  --mirostat_ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    fprintf(stderr, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
-    fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
-    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
-    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
-    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    if (gptneox_mlock_supported()) {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
-    }
-    if (gptneox_mmap_supported()) {
-        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
-    }
-    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
-    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
-    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "\n");
-}
-
-std::string gpt_random_prompt(std::mt19937 & rng) {
-    const int r = rng() % 10;
-    switch (r) {
-        case 0: return "So";
-        case 1: return "Once upon a time";
-        case 2: return "When";
-        case 3: return "The";
-        case 4: return "After";
-        case 5: return "If";
-        case 6: return "import";
-        case 7: return "He";
-        case 8: return "She";
-        case 9: return "They";
-        default: return "To";
-    }
-
-    return "The";
-}
-
-// TODO: not great allocating this every time
-std::vector<gptneox_token> gptneox_tokenize(struct gptneox_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<gptneox_token> res(text.size() + (int)add_bos);
-    int n = gptneox_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
-
-/* Keep track of current color of output, and emit ANSI code if it changes. */
-void set_console_color(console_state & con_st, console_color_t color) {
-    if (con_st.use_color && con_st.color != color) {
-        switch(color) {
-            case CONSOLE_COLOR_DEFAULT:
-                printf(ANSI_COLOR_RESET);
-                break;
-            case CONSOLE_COLOR_PROMPT:
-                printf(ANSI_COLOR_YELLOW);
-                break;
-            case CONSOLE_COLOR_USER_INPUT:
-                printf(ANSI_BOLD ANSI_COLOR_GREEN);
-                break;
-        }
-        con_st.color = color;
-    }
-}
--- a/third_party/radpajama/common-gptneox.h
+++ b/third_party/radpajama/common-gptneox.h
@ -1,111 +0,0 @@
-// -*- c++; c-basic-offset:4 -*-
-#ifndef COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
-#define COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
-#include "libc/macros.internal.h"
-#include "libc/runtime/runtime.h"
-#include "third_party/libcxx/random"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/thread"
-#include "third_party/libcxx/unordered_map"
-#include "third_party/libcxx/vector"
-#include "third_party/radpajama/gptneox.h"
-// Various helper functions and utilities
-
-//
-// CLI argument parsing
-//
-
-struct gpt_params {
-    int32_t seed          = -1;   // RNG seed
-    int32_t n_threads     = MIN(20., (unsigned) __get_cpu_count() * 0.75);
-    int32_t n_predict     = 128;  // new tokens to predict
-    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
-
-    // sampling parameters
-    std::unordered_map<gptneox_token, float> logit_bias; // logit bias for specific tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typical_p         = 1.00f; // 1.0 = disabled
-    float   temp              = 0.80f; // 1.0 = disabled
-    float   repeat_penalty    = 1.10f; // 1.0 = disabled
-    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   frequency_penalty = 0.00f; // 0.0 = disabled
-    float   presence_penalty  = 0.00f; // 0.0 = disabled
-    int     mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-
-    std::string model  = "./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat/Instruct-3B-v1-f16.bin"; // model path
-    std::string prompt = "";
-    std::string path_session = "";       // path to file for saving/loading model eval state
-    std::string input_prefix = "";       // string to prefix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-
-    std::string lora_adapter = "";  // lora adapter path
-    std::string lora_base = "";     // base model path for the lora adapter
-
-    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
-    bool random_prompt     = false; // do not randomize prompt if none provided
-    bool use_color         = false; // use color to distinguish generations and inputs
-    bool interactive       = false; // interactive mode
-
-    bool embedding         = false; // get only sentence embedding
-    bool interactive_first = false; // wait for user input immediately
-
-    bool instruct          = false; // instruction mode
-    bool penalize_nl       = true;  // consider newlines as a repeatable token
-    bool perplexity        = false; // compute perplexity over the prompt
-    bool use_mmap          = true;  // use mmap for faster loads
-    bool use_mlock         = false; // use mlock to keep model in memory
-    bool mem_test          = false; // compute maximum memory usage
-    bool verbose_prompt    = false; // print prompt tokens before generation
-};
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
-
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
-
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-//
-// Vocab utils
-//
-
-std::vector<gptneox_token> gptneox_tokenize(struct gptneox_context * ctx, const std::string & text, bool add_bos);
-
-//
-// Console utils
-//
-
-#define ANSI_COLOR_RED     "\x1b[31m"
-#define ANSI_COLOR_GREEN   "\x1b[32m"
-#define ANSI_COLOR_YELLOW  "\x1b[33m"
-#define ANSI_COLOR_BLUE    "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_RESET   "\x1b[0m"
-#define ANSI_BOLD          "\x1b[1m"
-
-enum console_color_t {
-    CONSOLE_COLOR_DEFAULT=0,
-    CONSOLE_COLOR_PROMPT,
-    CONSOLE_COLOR_USER_INPUT
-};
-
-struct console_state {
-    bool use_color = false;
-    console_color_t color = CONSOLE_COLOR_DEFAULT;
-};
-
-void set_console_color(console_state & con_st, console_color_t color);
-
-#if defined (_WIN32)
-void win32_console_init(bool enable_color);
-void win32_utf8_encode(const std::wstring & wstr, std::string & str);
-#endif
-
-#endif /* COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_ */
--- a/third_party/radpajama/copy-gptneox.cc
+++ b/third_party/radpajama/copy-gptneox.cc
@ -1,89 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                             :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  radpajama.com                                                               │
-│  Copyright (c) 2023 Ariel Núñez                                              │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/log/log.h"
-#include "third_party/ggml/ggml.h"
-#include "third_party/libcxx/cstdio"
-#include "third_party/libcxx/map"
-#include "third_party/libcxx/string"
-#include "third_party/radpajama/gptneox.h"
-
-static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
-  {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
-  {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
-  {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
-  //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
-  {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
-  {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
-  {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
-};
-
-// usage:
-//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    verynice();
-    ShowCrashReports();
-
-    ggjt_v1();
-    ggml_time_init();
-
-    if (argc < 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin ftype\n", argv[0]);
-        for (auto it = GPTNEOX_FTYPE_MAP.begin(); it != GPTNEOX_FTYPE_MAP.end(); it++) {
-            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
-        }
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    enum gptneox_ftype ftype;
-    if (argv[3][0] == 'q') {
-        auto it = GPTNEOX_FTYPE_MAP.find(argv[3]);
-        if (it == GPTNEOX_FTYPE_MAP.end()) {
-            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
-            return 1;
-        }
-        ftype = it->second;
-    } else {
-        ftype = (enum gptneox_ftype)atoi(argv[3]);
-    }
-
-    gptneox_model_copy(fname_inp.c_str(), fname_out.c_str(), ftype);
-
-    return 0;
-}
--- a/third_party/radpajama/gptneox-util.h
+++ b/third_party/radpajama/gptneox-util.h
@ -1,439 +0,0 @@
-// -*- c++; c-basic-offset:4 -*-
-#ifndef GPTNEOX_UTIL_H
-#define GPTNEOX_UTIL_H
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/rlimit.h"
-#include "libc/calls/struct/rusage.h"
-#include "libc/calls/weirdtypes.h"
-#include "libc/errno.h"
-#include "libc/runtime/pathconf.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/sysconf.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/f.h"
-#include "libc/sysv/consts/fileno.h"
-#include "libc/sysv/consts/madv.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/mfd.h"
-#include "libc/sysv/consts/mlock.h"
-#include "libc/sysv/consts/mremap.h"
-#include "libc/sysv/consts/msync.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/ok.h"
-#include "libc/sysv/consts/posix.h"
-#include "libc/sysv/consts/prio.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/sysv/consts/rlim.h"
-#include "libc/sysv/consts/rlimit.h"
-#include "libc/sysv/consts/rusage.h"
-#include "libc/time/time.h"
-#include "third_party/ggml/llama_util.h"
-#include "third_party/libcxx/cerrno"
-#include "third_party/libcxx/climits"
-#include "third_party/libcxx/cstdarg"
-#include "third_party/libcxx/cstdint"
-#include "third_party/libcxx/cstdio"
-#include "third_party/libcxx/cstdlib"
-#include "third_party/libcxx/cstring"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/vector"
-#include "third_party/musl/crypt.h"
-#include "third_party/musl/lockf.h"
-
-// Internal header to be included only by llama.cpp.
-// Contains wrappers around OS interfaces.
-
-#define GPTNEOX_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "GPTNEOX_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
-static std::string format(const char * fmt, ...) {
-    va_list ap, ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    int size = vsnprintf(NULL, 0, fmt, ap);
-    GPTNEOX_ASSERT(size >= 0 && size < INT_MAX);
-    std::vector<char> buf(size + 1);
-    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
-    GPTNEOX_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-    return std::string(buf.data(), size);
-}
-
-struct gptneox_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    gptneox_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            Die("failed to open %s: %s", fname, std::strerror(errno));
-        }
-        seek(0, SEEK_END);
-        size = tell();
-        seek(0, SEEK_SET);
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        GPTNEOX_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        GPTNEOX_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            Die("read error: %s", strerror(errno));
-        }
-        if (ret != 1) {
-            Die("unexpectedly reached end of file");
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            Die("write error: %s", strerror(errno));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~gptneox_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-#if defined(_WIN32)
-static std::string gptneox_format_win_err(DWORD err) {
-    LPSTR buf;
-    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
-    if (!size) {
-        return "FormatMessageA failed";
-    }
-    std::string ret(buf, size);
-    LocalFree(buf);
-    return ret;
-}
-#endif
-
-struct gptneox_mmap {
-    void * addr;
-    size_t size;
-
-    gptneox_mmap(const gptneox_mmap &) = delete;
-
-#ifdef _POSIX_MAPPED_FILES
-    static constexpr bool SUPPORTED = true;
-
-    gptneox_mmap(struct gptneox_file * file, bool prefetch = true) {
-        size = file->size;
-        int fd = fileno(file->fp);
-        int flags = MAP_SHARED;
-#if defined(__linux__) || defined(__COSMOPOLITAN__)
-        flags |= MAP_POPULATE;
-#endif
-        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        if (addr == MAP_FAILED) {
-            Die("mmap failed: %s", strerror(errno));
-        }
-
-        if (prefetch) {
-            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-    }
-
-    ~gptneox_mmap() {
-        munmap(addr, size);
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    gptneox_mmap(struct gptneox_file * file, bool prefetch = true) {
-        size = file->size;
-
-        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
-
-        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();
-
-        if (hMapping == NULL) {
-            Die("CreateFileMappingA failed: %s", gptneox_format_win_err(error).c_str());
-        }
-
-        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
-        CloseHandle(hMapping);
-
-        if (addr == NULL) {
-            Die("MapViewOfFile failed: %s", gptneox_format_win_err(error).c_str());
-        }
-
-        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        if (prefetch) {
-            // Advise the kernel to preload the mapped memory
-            WIN32_MEMORY_RANGE_ENTRY range;
-            range.VirtualAddress = addr;
-            range.NumberOfBytes = (SIZE_T)size;
-            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                        gptneox_format_win_err(GetLastError()).c_str());
-            }
-        }
-        #else
-        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
-        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
-    }
-
-    ~gptneox_mmap() {
-        if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
-                    gptneox_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    gptneox_mmap(struct gptneox_file *) {
-        Die("mmap not supported");
-    }
-#endif
-};
-
-// Represents some region of memory being locked using mlock or VirtualLock;
-// will automatically unlock on destruction.
-struct gptneox_mlock {
-    void * addr = NULL;
-    size_t size = 0;
-    bool failed_already = false;
-
-    gptneox_mlock() {}
-    gptneox_mlock(const gptneox_mlock &) = delete;
-
-    ~gptneox_mlock() {
-        if (size) {
-            raw_unlock(addr, size);
-        }
-    }
-
-    void init(void * addr) {
-        GPTNEOX_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
-    }
-
-    void grow_to(size_t target_size) {
-        GPTNEOX_ASSERT(addr);
-        if (failed_already) {
-            return;
-        }
-        size_t granularity = lock_granularity();
-        target_size = (target_size + granularity - 1) & ~(granularity - 1);
-        if (target_size > size) {
-            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
-                size = target_size;
-            } else {
-                failed_already = true;
-            }
-        }
-    }
-
-#ifdef _POSIX_MEMLOCK_RANGE
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        return (size_t) sysconf(_SC_PAGESIZE);
-    }
-
-    #ifdef __APPLE__
-        #define MLOCK_SUGGESTION \
-            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-    #else
-        #define MLOCK_SUGGESTION \
-            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-    #endif
-
-    bool raw_lock(const void * addr, size_t size) {
-        if (!mlock(addr, size)) {
-            return true;
-        } else {
-            char* errmsg = std::strerror(errno);
-            bool suggest = (errno == ENOMEM);
-
-            // Check if the resource limit is fine after all
-            struct rlimit lock_limit;
-            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
-                suggest = false;
-            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
-                suggest = false;
-
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
-            return false;
-        }
-    }
-
-    #undef MLOCK_SUGGESTION
-
-    void raw_unlock(void * addr, size_t size) {
-        if (munlock(addr, size)) {
-            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
-        }
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        SYSTEM_INFO si;
-        GetSystemInfo(&si);
-        return (size_t) si.dwPageSize;
-    }
-
-    bool raw_lock(void * addr, size_t size) {
-        for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
-                return true;
-            }
-            if (tries == 2) {
-                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, gptneox_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-
-            // It failed but this was only the first try; increase the working
-            // set size and try again.
-            SIZE_T min_ws_size, max_ws_size;
-            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
-                        gptneox_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-            // Per MSDN: "The maximum number of pages that a process can lock
-            // is equal to the number of pages in its minimum working set minus
-            // a small overhead."
-            // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
-            // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += increment;
-            max_ws_size += increment;
-            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
-                        gptneox_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-        }
-    }
-
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
-            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
-                    gptneox_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    void raw_lock(const void * addr, size_t size) {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
-    }
-
-    void raw_unlock(const void * addr, size_t size) {}
-#endif
-};
-
-// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
-struct gptneox_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
-    }
-
-    ~gptneox_buffer() {
-        delete[] addr;
-    }
-};
-
-#ifdef GGML_USE_CUBLAS
-// MISSING #include "ggml-cuda.h"
-struct gptneox_ctx_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        if (addr) {
-            ggml_cuda_host_free(addr);
-        }
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
-        this->size = size;
-    }
-
-    ~gptneox_ctx_buffer() {
-        if (addr) {
-            ggml_cuda_host_free(addr);
-        }
-    }
-};
-#else
-typedef gptneox_buffer gptneox_ctx_buffer;
-#endif
-
-#endif
--- a/third_party/radpajama/gptneox.cc
+++ b/third_party/radpajama/gptneox.cc
--- a/third_party/radpajama/gptneox.h
+++ b/third_party/radpajama/gptneox.h
@ -1,272 +0,0 @@
-// -*- c++; c-basic-offset:4 -*-
-#ifndef GPTNEOX_H
-#define GPTNEOX_H
-
-#ifdef GPTNEOX_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef GPTNEOX_BUILD
-#            define GPTNEOX_API __declspec(dllexport)
-#        else
-#            define GPTNEOX_API __declspec(dllimport)
-#        endif
-#    else
-#        define GPTNEOX_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define GPTNEOX_API
-#endif
-
-#define GPTNEOX_FILE_VERSION 1
-#define GPTNEOX_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
-#define GPTNEOX_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    //
-    // C interface
-    //
-    // TODO: show sample usage
-    //
-
-    struct gptneox_context;
-
-    typedef int gptneox_token;
-
-    typedef struct gptneox_token_data {
-        gptneox_token id;  // token id
-        float logit; // log-odds of the token
-        float p;     // probability of the token
-    } gptneox_token_data;
-
-    typedef struct gptneox_token_data_array {
-        gptneox_token_data * data;
-        size_t size;
-        bool sorted;
-    } gptneox_token_data_array;
-
-    typedef void (*gptneox_progress_callback)(float progress, void *ctx);
-
-    struct gptneox_context_params {
-        int n_ctx;   // text context
-        int n_parts; // -1 for default
-        int seed;    // RNG seed, 0 for random
-
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the gptneox_eval() call computes all logits, not just the last one
-        bool vocab_only; // only load the vocabulary, no weights
-        bool use_mmap;   // use mmap if possible
-        bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        gptneox_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
-    };
-
-    // model file types
-    enum gptneox_ftype {
-        GPTNEOX_FTYPE_ALL_F32     = 0,
-        GPTNEOX_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        GPTNEOX_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        GPTNEOX_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
-        GPTNEOX_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GPTNEOX_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
-        // GPTNEOX_FTYPE_MOSTLY_Q4_3 (6) support has been removed
-        GPTNEOX_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        GPTNEOX_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        GPTNEOX_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
-    };
-
-    GPTNEOX_API struct gptneox_context_params gptneox_context_default_params();
-
-    GPTNEOX_API bool gptneox_mmap_supported();
-    GPTNEOX_API bool gptneox_mlock_supported();
-
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    GPTNEOX_API struct gptneox_context * gptneox_init_from_file(
-                             const char * path_model,
-            struct gptneox_context_params   params);
-
-    // Frees all allocated memory
-    GPTNEOX_API void gptneox_free(struct gptneox_context * ctx);
-
-    // TODO: not great API - very likely to change
-    // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
-    GPTNEOX_API int gptneox_model_quantize(
-            const char * fname_inp,
-            const char * fname_out,
-      enum gptneox_ftype   ftype,
-            int          nthread);
-
-    GPTNEOX_API int gptneox_model_copy(
-            const char * fname_inp,
-            const char * fname_out,
-            enum gptneox_ftype   ftype);
-
-    // Apply a LoRA adapter to a loaded model
-    // path_base_model is the path to a higher quality model to use as a base for
-    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-    // will be applied on top of the previous one
-    // Returns 0 on success
-    GPTNEOX_API int gptneox_apply_lora_from_file(
-            struct gptneox_context * ctx,
-                      const char * path_lora,
-                      const char * path_base_model,
-                             int   n_threads);
-
-    // Returns the number of tokens in the KV cache
-    GPTNEOX_API int gptneox_get_kv_cache_token_count(struct gptneox_context * ctx);
-
-    // Sets the current rng seed.
-    GPTNEOX_API void gptneox_set_rng_seed(struct gptneox_context * ctx, int seed);
-
-    // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
-    GPTNEOX_API size_t gptneox_get_state_size(struct gptneox_context * ctx);
-
-    // Copies the state to the specified destination address.
-    // Destination needs to have allocated enough memory.
-    // Returns the number of bytes copied
-    GPTNEOX_API size_t gptneox_copy_state_data(struct gptneox_context * ctx, uint8_t * dest);
-
-    // Set the state reading from the specified address
-    // Returns the number of bytes read
-    GPTNEOX_API size_t gptneox_set_state_data(struct gptneox_context * ctx, const uint8_t * src);
-
-    // Save/load session file
-    GPTNEOX_API size_t gptneox_load_session_file(struct gptneox_context * ctx, const char * path_session, gptneox_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
-    GPTNEOX_API size_t gptneox_save_session_file(struct gptneox_context * ctx, const char * path_session, const gptneox_token * tokens, size_t n_token_count);
-
-    // Run the llama inference to obtain the logits and probabilities for the next token.
-    // tokens + n_tokens is the provided batch of new tokens to process
-    // n_past is the number of tokens to use from previous eval calls
-    // Returns 0 on success
-    GPTNEOX_API int gptneox_eval(
-            struct gptneox_context * ctx,
-               const gptneox_token * tokens,
-                             int   n_tokens,
-                             int   n_past,
-                             int   n_threads);
-
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
-    GPTNEOX_API int gptneox_tokenize(
-            struct gptneox_context * ctx,
-                      const char * text,
-                     gptneox_token * tokens,
-                             int   n_max_tokens,
-                            bool   add_bos);
-
-    GPTNEOX_API int gptneox_n_vocab(struct gptneox_context * ctx);
-    GPTNEOX_API int gptneox_n_ctx  (struct gptneox_context * ctx);
-    GPTNEOX_API int gptneox_n_embd (struct gptneox_context * ctx);
-
-    // Token logits obtained from the last call to gptneox_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    GPTNEOX_API float * gptneox_get_logits(struct gptneox_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    GPTNEOX_API float * gptneox_get_embeddings(struct gptneox_context * ctx);
-
-    // Token Id -> String. Uses the vocabulary in the provided context
-    GPTNEOX_API const char * gptneox_token_to_str(struct gptneox_context * ctx, gptneox_token token);
-
-    // String -> Token Id. Uses the vocabulary in the provided context
-    GPTNEOX_API gptneox_token gptneox_str_to_token(struct gptneox_context * ctx, const char * str);
-
-    // Special tokens
-    GPTNEOX_API gptneox_token gptneox_token_bos();
-    GPTNEOX_API gptneox_token gptneox_token_eos();
-    // GPTNEOX_API gptneox_token gptneox_token_nl();
-
-    // TODO: improve the last_n_tokens interface ?
-    GPTNEOX_API gptneox_token gptneox_sample_top_p_top_k(
-       struct gptneox_context * ctx,
-          const gptneox_token * last_n_tokens_data,
-                        int   last_n_tokens_size,
-                        int   top_k,
-                      float   top_p,
-                      float   temp,
-                      float   repeat_penalty);
-
-    // Sampling functions
-
-    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    GPTNEOX_API void gptneox_sample_repetition_penalty(struct gptneox_context * ctx, gptneox_token_data_array * candidates, gptneox_token * last_tokens, size_t last_tokens_size, float penalty);
-
-    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    GPTNEOX_API void gptneox_sample_frequency_and_presence_penalties(struct gptneox_context * ctx, gptneox_token_data_array * candidates, gptneox_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
-
-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    GPTNEOX_API void gptneox_sample_softmax(struct gptneox_context * ctx, gptneox_token_data_array * candidates);
-
-    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    GPTNEOX_API void gptneox_sample_top_k(struct gptneox_context * ctx, gptneox_token_data_array * candidates, int k, size_t min_keep);
-
-    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    GPTNEOX_API void gptneox_sample_top_p(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float p, size_t min_keep);
-
-    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    GPTNEOX_API void gptneox_sample_tail_free(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float z, size_t min_keep);
-
-    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    GPTNEOX_API void gptneox_sample_typical(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float p, size_t min_keep);
-    GPTNEOX_API void gptneox_sample_temperature(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float temp);
-
-    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-    /// @param candidates A vector of `gptneox_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    GPTNEOX_API gptneox_token gptneox_sample_token_mirostat(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float tau, float eta, int m, float * mu);
-
-    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-    /// @param candidates A vector of `gptneox_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    GPTNEOX_API gptneox_token gptneox_sample_token_mirostat_v2(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float tau, float eta, float * mu);
-
-    /// @details Selects the token with the highest probability.
-    GPTNEOX_API gptneox_token gptneox_sample_token_greedy(struct gptneox_context * ctx, gptneox_token_data_array * candidates);
-
-    /// @details Randomly selects a token from the candidates based on their probabilities.
-    GPTNEOX_API gptneox_token gptneox_sample_token(struct gptneox_context * ctx, gptneox_token_data_array * candidates);
-
-    // Performance information
-    GPTNEOX_API void gptneox_print_timings(struct gptneox_context * ctx);
-    GPTNEOX_API void gptneox_reset_timings(struct gptneox_context * ctx);
-
-    // Print system information
-    GPTNEOX_API const char * gptneox_print_system_info(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
-#ifdef GPTNEOX_API_INTERNAL
-
-#include "third_party/libcxx/vector"
-#include "third_party/libcxx/string"
-struct ggml_tensor;
-
-std::vector<std::pair<std::string, struct ggml_tensor *>>& gptneox_internal_get_tensor_map(struct gptneox_context * ctx);
-
-#endif
-
-#endif // GPTNEOX_H
--- a/third_party/radpajama/main-redpajama-chat.cc
+++ b/third_party/radpajama/main-redpajama-chat.cc
@ -1,383 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                             :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  radpajama.com                                                               │
-│  Copyright (c) 2023 Ariel Núñez                                              │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
-#include "libc/calls/sigtimedwait.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/siginfo.h"
-#include "libc/calls/weirdtypes.h"
-#include "libc/log/log.h"
-#include "libc/runtime/pathconf.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/sysconf.h"
-#include "libc/sysv/consts/f.h"
-#include "libc/sysv/consts/fileno.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/ok.h"
-#include "libc/sysv/consts/sa.h"
-#include "libc/sysv/consts/sicode.h"
-#include "libc/sysv/consts/sig.h"
-#include "libc/sysv/consts/ss.h"
-#include "libc/time/time.h"
-#include "third_party/libcxx/algorithm"
-#include "third_party/libcxx/cassert"
-#include "third_party/libcxx/cinttypes"
-#include "third_party/libcxx/cmath"
-#include "third_party/libcxx/cstdio"
-#include "third_party/libcxx/cstring"
-#include "third_party/libcxx/ctime"
-#include "third_party/libcxx/fstream"
-#include "third_party/libcxx/iostream"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/vector"
-#include "third_party/musl/crypt.h"
-#include "third_party/musl/lockf.h"
-#include "third_party/radpajama/common-gptneox.h"
-#include "third_party/radpajama/gptneox.h"
-
-static console_state con_st;
-static gptneox_context ** g_ctx;
-
-static bool is_interacting = false;
-
-void sigint_handler(int signo) {
-    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-    printf("\n"); // this also force flush stdout.
-    if (signo == SIGINT) {
-        if (!is_interacting) {
-            is_interacting=true;
-        } else {
-            gptneox_print_timings(*g_ctx);
-            _exit(130);
-        }
-    }
-}
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-    params.model = "./models/ggml-RedPajama-INCITE-Chat-3B-v1-f16.bin";
-
-    con_st.use_color = true;
-    params.n_ctx = 2048;
-    params.seed = 1684054676;
-    params.use_mmap = true;
-    params.use_mlock = true;
-    params.memory_f16 = true;
-    params.mem_test = false;
-    params.interactive = true;
-    params.top_k = 30;
-    params.top_p =  0.95;
-    params.temp = 0.8;
-    params.repeat_last_n = 3;
-    params.repeat_penalty = 1.1;
-    params.instruct = true;
-    params.interactive = true;
-
-    verynice();
-    ShowCrashReports();
-
-    if (gpt_params_parse(argc, argv, params) == false) {  return 1; }
-
-    std::mt19937 rng(params.seed);
-    gptneox_context * ctx;
-    g_ctx = &ctx;
-
-    {
-        auto lparams = gptneox_context_default_params();
-
-        lparams.n_ctx      = params.n_ctx;
-        lparams.n_parts    = params.n_parts;
-        lparams.seed       = params.seed;
-        lparams.f16_kv     = params.memory_f16;
-        lparams.use_mmap   = params.use_mmap;
-        lparams.use_mlock  = params.use_mlock;
-
-        ctx = gptneox_init_from_file(params.model.c_str(), lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-    }
-
-    if (!params.lora_adapter.empty()) {
-        int err = gptneox_apply_lora_from_file(ctx,
-                                               params.lora_adapter.c_str(),
-                                               params.lora_base.empty() ? NULL : params.lora_base.c_str(),
-                                               params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return 1;
-        }
-    }
-
-    verynice();
-    ShowCrashReports();
-
-    // Always interactive for RedPajama chat model
-    params.interactive = true;
-
-    if (params.interactive) {
-        struct sigaction sigint_action;
-        sigint_action.sa_handler = sigint_handler;
-        sigemptyset (&sigint_action.sa_mask);
-        sigint_action.sa_flags = 0;
-        sigaction(SIGINT, &sigint_action, NULL);
-    }
-    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
-        params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", params.n_ctx, params.n_batch, params.n_predict, params.n_keep);
-    fprintf(stderr, "\n\n");
-
-    // TODO: replace with ring-buffer
-    std::vector<gptneox_token> last_n_tokens = std::vector<gptneox_token>();
-
-    set_console_color(con_st, CONSOLE_COLOR_PROMPT);
-
-    while (true) {
-        is_interacting = true;
-        int n_past = 0;
-        set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
-
-        if (params.instruct) {
-            printf("\n<human>: ");
-        }
-
-        std::string buffer;
-        if (!params.input_prefix.empty()) {
-            buffer += params.input_prefix;
-            printf("%s", buffer.c_str());
-        }
-
-        std::string line;
-        bool another_line = true;
-        do {
-            if (!std::getline(std::cin, line)) {
-                // input stream is bad or EOF received
-                return 0;
-            }
-            if (line.empty() || line.back() != '\\') {
-                another_line = false;
-            } else {
-                line.pop_back(); // Remove the continue character
-            }
-            buffer += line;
-            if (another_line) {
-                buffer += '\n';
-            }
-        } while (another_line);
-
-        is_interacting = false;
-
-        // done taking input, reset color
-        set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-        // Check for input
-        if (buffer.length() <= 0) {
-            continue; // Restart loop for input
-        }
-
-        auto prompt_embd = ::gptneox_tokenize(ctx, buffer, false);
-        auto embd_inp = std::vector<gptneox_token>();
-
-        embd_inp.push_back(gptneox_str_to_token(ctx, "<"));
-        embd_inp.push_back(gptneox_str_to_token(ctx, "human"));
-        embd_inp.push_back(gptneox_str_to_token(ctx, ">:"));
-
-        embd_inp.insert(embd_inp.end(), prompt_embd.begin(), prompt_embd.end());
-
-        embd_inp.push_back(gptneox_str_to_token(ctx, "\n"));
-        embd_inp.push_back(gptneox_str_to_token(ctx, "<"));
-        embd_inp.push_back(gptneox_str_to_token(ctx, "bot"));
-        embd_inp.push_back(gptneox_str_to_token(ctx, ">:"));
-
-        // How many tokens to generate - check if theres space in context for atleast one token (or batch size tokens?)
-        int inp_size = embd_inp.size();
-        auto space = params.n_ctx - inp_size;
-        if(space <= 0) {
-            fprintf(stderr, "%s : input too long\n", __func__);
-            continue;
-        }
-        // Send batches to eval
-        while (n_past < inp_size) {
-            auto remaining = inp_size - n_past;
-            int n_eval = params.n_batch < remaining ? params.n_batch : remaining;
-            if (gptneox_eval(ctx, &embd_inp[n_past], n_eval, n_past, params.n_threads)) {
-                fprintf(stderr, "<bot>: %s : failed to eval\n", __func__);
-                return 1;
-            }
-            n_past += n_eval;
-        }
-
-        const int n_ctx = gptneox_n_ctx(ctx);
-        const int n_vocab = gptneox_n_vocab(ctx);
-
-        const float   temp            = params.temp;
-        const int32_t top_k           = params.top_k <= 0 ? gptneox_n_vocab(ctx) : params.top_k;
-        const float   top_p           = params.top_p;
-        const float   tfs_z           = params.tfs_z;
-        const float   typical_p       = params.typical_p;
-        const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-        const float   repeat_penalty  = params.repeat_penalty;
-        const float   alpha_presence  = params.presence_penalty;
-        const float   alpha_frequency = params.frequency_penalty;
-        const int     mirostat        = params.mirostat;
-        const float   mirostat_tau    = params.mirostat_tau;
-        const float   mirostat_eta    = params.mirostat_eta;
-        const bool    penalize_nl     = params.penalize_nl;
-
-        // Eval until space runs out
-        auto out_count = 0;
-
-        printf("<bot>:");
-        while (space > 0) {
-            // Get token
-            gptneox_token id = 0;
-
-            {
-                auto logits = gptneox_get_logits(ctx);
-
-                // Apply params.logit_bias map
-                for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-                    logits[it->first] += it->second;
-                }
-
-                std::vector<gptneox_token_data> candidates;
-                candidates.reserve(n_vocab);
-                for (gptneox_token token_id = 0; token_id < n_vocab; token_id++) {
-                    candidates.emplace_back(gptneox_token_data{token_id, logits[token_id], 0.0f});
-                }
-
-                gptneox_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-                // Apply penalties
-                gptneox_token nl_token = gptneox_str_to_token(ctx, "\n");
-                float nl_logit = logits[nl_token];
-                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-                gptneox_sample_repetition_penalty(ctx, &candidates_p,
-                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                    last_n_repeat, repeat_penalty);
-                gptneox_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                    last_n_repeat, alpha_frequency, alpha_presence);
-                if (!penalize_nl) {
-                    logits[nl_token] = nl_logit;
-                }
-
-                if (temp <= 0) {
-                    // Greedy sampling
-                    id = gptneox_sample_token_greedy(ctx, &candidates_p);
-                } else {
-                    if (mirostat == 1) {
-                        static float mirostat_mu = 2.0f * mirostat_tau;
-                        const int mirostat_m = 100;
-                        gptneox_sample_temperature(ctx, &candidates_p, temp);
-                        id = gptneox_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-                    } else if (mirostat == 2) {
-                        static float mirostat_mu = 2.0f * mirostat_tau;
-                        gptneox_sample_temperature(ctx, &candidates_p, temp);
-                        id = gptneox_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-                    } else {
-                        // Temperature sampling
-                        gptneox_sample_top_k(ctx, &candidates_p, top_k, 1);
-                        gptneox_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
-                        gptneox_sample_typical(ctx, &candidates_p, typical_p, 1);
-                        gptneox_sample_top_p(ctx, &candidates_p, top_p, 1);
-                        gptneox_sample_temperature(ctx, &candidates_p, temp);
-                        id = gptneox_sample_token(ctx, &candidates_p);
-                    }
-                }
-            }
-
-            // Inc out count and dec space
-            out_count += 1;
-            space -= 1;
-            // Repeat tokens update
-            last_n_tokens.push_back(id);
-            if ((int)last_n_tokens.size() > params.repeat_last_n) {
-                last_n_tokens.erase(last_n_tokens.begin());
-            }
-            // Redpajama: check if the interactive is done.
-            //std::cout<<" last_n_tokens.size: "<< last_n_tokens[0] <<" "<< last_n_tokens[1] <<" "<< last_n_tokens[2] << std::endl;
-            if (last_n_tokens.size()==3 && last_n_tokens[0]==gptneox_str_to_token(ctx, "<")
-            && last_n_tokens[1]==gptneox_str_to_token(ctx, "human") && last_n_tokens[2]==gptneox_str_to_token(ctx, ">:")){
-                space = 0;
-                continue;
-            }
-
-            // Check for eos - end early - check eos before bos in case they are the same
-            if (id == gptneox_token_eos()) {
-                space = 0;
-                continue;
-            }
-            // Check for bos - skip callback if so
-            if (id == gptneox_token_bos()) {
-                continue;
-            }
-
-            if (last_n_tokens[2]==gptneox_str_to_token(ctx, "<")){
-                ;
-            }
-            else if (last_n_tokens[2]==gptneox_str_to_token(ctx, "human")){
-                if (last_n_tokens[1]==gptneox_str_to_token(ctx, "<")){
-                    ;
-                }
-                else{
-                    printf("%s", gptneox_token_to_str(ctx, id));
-                }
-            }
-            else if (last_n_tokens[1]==gptneox_str_to_token(ctx, "<")){
-                    printf("<");
-                    printf("%s", gptneox_token_to_str(ctx, id));
-                }
-            else{
-                printf("%s", gptneox_token_to_str(ctx, id));
-            }
-            fflush(stdout);
-            // Check if we need to run another eval
-            if (space > 0) {
-                // Send generated token back into model for next generation
-                if (gptneox_eval(ctx, &id, 1, n_past, params.n_threads)) {
-                    fprintf(stderr, "%s : failed to eval\n", __func__);
-                    return 1;
-                }
-                // Increment past count
-                n_past += 1;
-            }
-            // Check for user interrupt
-            if (is_interacting) { space = 0; }
-        }
-        printf("\n");
-        fflush(stdout);
-    }
-
-    gptneox_print_timings(ctx);
-    gptneox_free(ctx);
-    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-    return 0;
-}
--- a/third_party/radpajama/main-redpajama.cc
+++ b/third_party/radpajama/main-redpajama.cc
@ -1,657 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                             :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  radpajama.com                                                               │
-│  Copyright (c) 2023 Ariel Núñez                                              │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/calls/calls.h"
-#include "libc/calls/sigtimedwait.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/siginfo.h"
-#include "libc/calls/weirdtypes.h"
-#include "libc/log/log.h"
-#include "libc/runtime/pathconf.h"
-#include "libc/runtime/runtime.h"
-#include "libc/runtime/sysconf.h"
-#include "libc/sysv/consts/f.h"
-#include "libc/sysv/consts/fileno.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/ok.h"
-#include "libc/sysv/consts/sa.h"
-#include "libc/sysv/consts/sicode.h"
-#include "libc/sysv/consts/sig.h"
-#include "libc/sysv/consts/ss.h"
-#include "libc/time/time.h"
-#include "third_party/libcxx/cassert"
-#include "third_party/libcxx/cinttypes"
-#include "third_party/libcxx/cmath"
-#include "third_party/libcxx/cstdio"
-#include "third_party/libcxx/cstring"
-#include "third_party/libcxx/ctime"
-#include "third_party/libcxx/fstream"
-#include "third_party/libcxx/iostream"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/vector"
-#include "third_party/musl/crypt.h"
-#include "third_party/musl/lockf.h"
-#include "third_party/radpajama/common-gptneox.h"
-#include "third_party/radpajama/gptneox.h"
-
-static console_state con_st;
-static gptneox_context ** g_ctx;
-
-static bool is_interacting = false;
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-void sigint_handler(int signo) {
-    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-    printf("\n"); // this also force flush stdout.
-    if (signo == SIGINT) {
-        if (!is_interacting) {
-            is_interacting=true;
-        } else {
-            gptneox_print_timings(*g_ctx);
-            _exit(130);
-        }
-    }
-}
-#endif
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-    params.model = "./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Instruct-3B-v1-f16.bin";
-
-    verynice();
-    ShowCrashReports();
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    // save choice to use color for later
-    // (note for later: this is a slightly awkward choice)
-    con_st.use_color = params.use_color;
-
-#if defined (_WIN32)
-    win32_console_init(params.use_color);
-#endif
-
-    if (params.perplexity) {
-        printf("\n************\n");
-        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-
-    if (params.embedding) {
-        printf("\n************\n");
-        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-
-    if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-//    params.prompt = R"(// this function checks if the number n is prime
-//bool is_prime(int n) {)";
-
-    gptneox_context * ctx;
-    g_ctx = &ctx;
-
-    // load the model
-    {
-        auto lparams = gptneox_context_default_params();
-
-        lparams.n_ctx      = params.n_ctx;
-        lparams.n_parts    = params.n_parts;
-        lparams.seed       = params.seed;
-        lparams.f16_kv     = params.memory_f16;
-        lparams.use_mmap   = params.use_mmap;
-        lparams.use_mlock  = params.use_mlock;
-
-        ctx = gptneox_init_from_file(params.model.c_str(), lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-    }
-
-    if (!params.lora_adapter.empty()) {
-        int err = gptneox_apply_lora_from_file(ctx,
-                                             params.lora_adapter.c_str(),
-                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return 1;
-        }
-    }
-
-    // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), gptneox_print_system_info());
-    }
-
-    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
-    // uncomment the "used_mem" line in llama.cpp to see the results
-    if (params.mem_test) {
-        {
-            const std::vector<gptneox_token> tmp(params.n_batch, 0);
-            gptneox_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
-        }
-
-        {
-            const std::vector<gptneox_token> tmp = { 0, };
-            gptneox_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
-        }
-
-        gptneox_print_timings(ctx);
-        gptneox_free(ctx);
-
-        return 0;
-    }
-    
-    std::string path_session = params.path_session;
-    std::vector<gptneox_token> session_tokens;
-
-    if (!path_session.empty()) {
-        fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str());
-
-        // REVIEW - fopen to check for existing session
-        FILE * fp = std::fopen(path_session.c_str(), "rb");
-        if (fp != NULL) {
-            std::fclose(fp);
-
-            session_tokens.resize(params.n_ctx);
-            size_t n_token_count_out = 0;
-            const size_t n_session_bytes = gptneox_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out);
-            session_tokens.resize(n_token_count_out);
-
-            if (n_session_bytes > 0) {
-                fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes);
-            } else {
-                fprintf(stderr, "%s: could not load session file, will recreate\n", __func__);
-            }
-        } else {
-            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
-        }
-    }
-
-    // tokenize the prompt
-    auto embd_inp = ::gptneox_tokenize(ctx, params.prompt, false); //true);
-
-    const int n_ctx = gptneox_n_ctx(ctx);
-
-    if ((int) embd_inp.size() > n_ctx - 4) {
-        fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
-        return 1;
-    }
-    
-    // debug message about similarity of saved session, if applicable
-    size_t n_matching_session_tokens = 0;
-    if (session_tokens.size()) {
-        for (gptneox_token id : session_tokens) {
-            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
-                break;
-            }
-            n_matching_session_tokens++;
-        }
-        if (n_matching_session_tokens >= embd_inp.size()) {
-            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
-        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
-        } else {
-            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
-                __func__, n_matching_session_tokens, embd_inp.size());
-        }
-    }
-
-    // number of tokens to keep when resetting context
-    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
-        params.n_keep = (int)embd_inp.size();
-    }
-
-    // in instruct mode, we inject a prefix and a suffix to each input by the user
-    if (params.instruct) {
-        params.interactive_first = true;
-        params.antiprompt.push_back("<|prompter|>");
-    }
-
-    // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_first) {
-        params.interactive = true;
-    }
-
-    // determine newline token
-    auto gptneox_token_newline = ::gptneox_tokenize(ctx, "\n", false);
-
-    if (params.verbose_prompt) {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], gptneox_token_to_str(ctx, embd_inp[i]));
-        }
-        if (params.n_keep > 0) {
-        fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
-            for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", gptneox_token_to_str(ctx, embd_inp[i]));
-            }
-            fprintf(stderr, "'\n");
-        }
-        fprintf(stderr, "\n");
-    }
-
-    if (params.interactive) {
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-        struct sigaction sigint_action;
-        sigint_action.sa_handler = sigint_handler;
-        sigemptyset (&sigint_action.sa_mask);
-        sigint_action.sa_flags = 0;
-        sigaction(SIGINT, &sigint_action, NULL);
-#elif defined (_WIN32)
-        signal(SIGINT, sigint_handler);
-#endif
-
-        fprintf(stderr, "%s: interactive mode on.\n", __func__);
-
-        if (params.antiprompt.size()) {
-            for (auto antiprompt : params.antiprompt) {
-                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
-            }
-        }
-
-        if (!params.input_prefix.empty()) {
-            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
-        }
-    }
-    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
-            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
-    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-    fprintf(stderr, "\n\n");
-
-    // TODO: replace with ring-buffer
-    std::vector<gptneox_token> last_n_tokens(n_ctx);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-    if (params.interactive) {
-        fprintf(stderr, "== Running in interactive mode. ==\n"
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-               " - Press Ctrl+C to interject at any time.\n"
-#endif
-               " - Press Return to return control to RedPajama.\n"
-               " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_first;
-    }
-
-    bool input_noecho  = false;
-    
-    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
-    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
-    // initial prompt so it doesn't need to be an exact match.
-    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
-
-
-    int n_past     = 0;
-    int n_remain   = params.n_predict;
-    int n_consumed = 0;
-    int n_session_consumed = 0;
-
-    // the first thing we will do is to output the prompt, so set color accordingly
-    set_console_color(con_st, CONSOLE_COLOR_PROMPT);
-
-    std::vector<gptneox_token> embd;
-
-    while (n_remain != 0 || params.interactive) {
-        // predict
-        if (embd.size() > 0) {
-            // infinite text generation via context swapping
-            // if we run out of context:
-            // - take the n_keep first tokens from the original prompt (via n_past)
-            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-            if (n_past + (int) embd.size() > n_ctx) {
-                const int n_left = n_past - params.n_keep;
-
-                n_past = params.n_keep;
-
-                // insert n_left/2 tokens at the start of embd from last_n_tokens
-                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
-                
-                // REVIEW - stop saving session if we run out of context
-                path_session = "";
-
-                //printf("\n---\n");
-                //printf("resetting: '");
-                //for (int i = 0; i < (int) embd.size(); i++) {
-                //    printf("%s", gptneox_token_to_str(ctx, embd[i]));
-                //}
-                //printf("'\n");
-                //printf("\n---\n");
-            }
-            
-            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
-            // REVIEW
-            if (n_session_consumed < (int) session_tokens.size()) {
-                size_t i = 0;
-                for ( ; i < embd.size(); i++) {
-                    if (embd[i] != session_tokens[n_session_consumed]) {
-                        session_tokens.resize(n_session_consumed);
-                        break;
-                    }
-
-                    n_past++;
-                    n_session_consumed++;
-
-                    if (n_session_consumed >= (int) session_tokens.size()) {
-                        break;
-                    }
-                }
-                if (i > 0) {
-                    embd.erase(embd.begin(), embd.begin() + i);
-                }
-            }
-
-            // evaluate tokens in batches
-            // embd is typically prepared beforehand to fit within a batch, but not always
-            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
-                int n_eval = (int) embd.size() - i;
-                if (n_eval > params.n_batch) {
-                    n_eval = params.n_batch;
-                }
-                if (gptneox_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
-                    fprintf(stderr, "%s : failed to eval\n", __func__);
-                    return 1;
-                }
-                n_past += n_eval;
-            }
-            
-            if (embd.size() > 0 && !path_session.empty()) {
-                session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
-                n_session_consumed = session_tokens.size();
-            }
-        }
-
-        embd.clear();
-
-        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            // out of user input, sample next token
-            const float   temp            = params.temp;
-            const int32_t top_k           = params.top_k <= 0 ? gptneox_n_vocab(ctx) : params.top_k;
-            const float   top_p           = params.top_p;
-            const float   tfs_z           = params.tfs_z;
-            const float   typical_p       = params.typical_p;
-            const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-            const float   repeat_penalty  = params.repeat_penalty;
-            const float   alpha_presence  = params.presence_penalty;
-            const float   alpha_frequency = params.frequency_penalty;
-            const int     mirostat        = params.mirostat;
-            const float   mirostat_tau    = params.mirostat_tau;
-            const float   mirostat_eta    = params.mirostat_eta;
-            const bool    penalize_nl     = params.penalize_nl;
-
-            // optionally save the session on first sample (for faster prompt loading next time)
-            if (!path_session.empty() && need_to_save_session) {
-                need_to_save_session = false;
-                gptneox_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
-            }
-
-            gptneox_token id = 0;
-
-            {
-                auto logits = gptneox_get_logits(ctx);
-                auto n_vocab = gptneox_n_vocab(ctx);
-                
-                // Apply params.logit_bias map
-                for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-                    logits[it->first] += it->second;
-                }
-
-                std::vector<gptneox_token_data> candidates;
-                candidates.reserve(n_vocab);
-                for (gptneox_token token_id = 0; token_id < n_vocab; token_id++) {
-                    candidates.emplace_back(gptneox_token_data{token_id, logits[token_id], 0.0f});
-                }
-
-                gptneox_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-                // Apply penalties
-                gptneox_token nl_token = gptneox_str_to_token(ctx, "\n");
-                float nl_logit = logits[nl_token];
-                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-                gptneox_sample_repetition_penalty(ctx, &candidates_p,
-                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                    last_n_repeat, repeat_penalty);
-                gptneox_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                    last_n_repeat, alpha_frequency, alpha_presence);
-                if (!penalize_nl) {
-                    logits[nl_token] = nl_logit;
-                }
-
-                if (temp <= 0) {
-                    // Greedy sampling
-                    id = gptneox_sample_token_greedy(ctx, &candidates_p);
-                } else {
-                    if (mirostat == 1) {
-                        static float mirostat_mu = 2.0f * mirostat_tau;
-                        const int mirostat_m = 100;
-                        gptneox_sample_temperature(ctx, &candidates_p, temp);
-                        id = gptneox_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-                    } else if (mirostat == 2) {
-                        static float mirostat_mu = 2.0f * mirostat_tau;
-                        gptneox_sample_temperature(ctx, &candidates_p, temp);
-                        id = gptneox_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-                    } else {
-                        // Temperature sampling
-                        gptneox_sample_top_k(ctx, &candidates_p, top_k, 1);
-                        gptneox_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
-                        gptneox_sample_typical(ctx, &candidates_p, typical_p, 1);
-                        gptneox_sample_top_p(ctx, &candidates_p, top_p, 1);
-                        gptneox_sample_temperature(ctx, &candidates_p, temp);
-                        id = gptneox_sample_token(ctx, &candidates_p);
-                    }
-                }
-                // printf("`%d`", candidates_p.size);
-
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(id);
-            }
-
-            // replace end of text token with newline token when in interactive mode
-            if (id == gptneox_token_eos() && params.interactive && !params.instruct) {
-                id = gptneox_token_newline.front();
-                if (params.antiprompt.size() != 0) {
-                    // tokenize and inject first reverse prompt
-                    const auto first_antiprompt = ::gptneox_tokenize(ctx, params.antiprompt.front(), false);
-                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
-                }
-            }
-
-            // add it to the context
-            embd.push_back(id);
-
-            // echo this to console
-            input_noecho = false;
-
-            // decrement remaining sampling budget
-            --n_remain;
-        } else {
-            // some user input remains from prompt or interaction, forward it to processing
-            while ((int) embd_inp.size() > n_consumed) {
-                embd.push_back(embd_inp[n_consumed]);
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(embd_inp[n_consumed]);
-                ++n_consumed;
-                if ((int) embd.size() >= params.n_batch) {
-                    break;
-                }
-            }
-        }
-
-        // display text
-        if (!input_noecho) {
-            for (auto id : embd) {
-                printf("%s", gptneox_token_to_str(ctx, id));
-            }
-            fflush(stdout);
-        }
-        // reset color to default if we there is no pending user input
-        if (!input_noecho && (int)embd_inp.size() == n_consumed) {
-            set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-        }
-
-        // in interactive mode, and not currently processing queued inputs;
-        // check if we should prompt the user for more
-        if (params.interactive && (int) embd_inp.size() <= n_consumed) {
-
-            // check for reverse prompt
-            if (params.antiprompt.size()) {
-                std::string last_output;
-                for (auto id : last_n_tokens) {
-                    last_output += gptneox_token_to_str(ctx, id);
-                }
-
-                // Check if each of the reverse prompts appears at the end of the output.
-                for (std::string & antiprompt : params.antiprompt) {
-                    if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
-                        is_interacting = true;
-                        set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
-                        fflush(stdout);
-                        break;
-                    }
-                }
-            }
-
-            if (n_past > 0 && is_interacting) {
-                // potentially set color to indicate we are taking user input
-                set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
-
-#if defined (_WIN32)
-                // Windows: must reactivate sigint handler after each signal
-                signal(SIGINT, sigint_handler);
-#endif
-
-                if (params.instruct) {
-                    printf("\n> ");
-                }
-
-                std::string buffer;
-                if (!params.input_prefix.empty()) {
-                    buffer += params.input_prefix;
-                    printf("%s", buffer.c_str());
-                }
-
-                std::string line;
-                bool another_line = true;
-                do {
-#if defined(_WIN32)
-                    std::wstring wline;
-                    if (!std::getline(std::wcin, wline)) {
-                        // input stream is bad or EOF received
-                        return 0;
-                    }
-                    win32_utf8_encode(wline, line);
-#else
-                    if (!std::getline(std::cin, line)) {
-                        // input stream is bad or EOF received
-                        return 0;
-                    }
-#endif
-                    if (line.empty() || line.back() != '\\') {
-                        another_line = false;
-                    } else {
-                        line.pop_back(); // Remove the continue character
-                    }
-                    buffer += line + '\n'; // Append the line to the result
-                } while (another_line);
-
-                // done taking input, reset color
-                set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-                // Add tokens to embd only if the input buffer is non-empty
-                // Entering a empty line lets the user pass control back
-                if (buffer.length() > 1) {
-
-                    auto line_inp = ::gptneox_tokenize(ctx, buffer, false);
-                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
-                    n_remain -= line_inp.size();
-                }
-
-                input_noecho = true; // do not echo this again
-            }
-
-            if (n_past > 0) {
-                is_interacting = false;
-            }
-        }
-
-        // end of text token
-        if (!embd.empty() && embd.back() == gptneox_token_eos()) {
-            if (params.instruct) {
-                is_interacting = true;
-            } else {
-                fprintf(stderr, " [end of text]\n");
-                break;
-            }
-        }
-
-        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
-        if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
-            n_remain = params.n_predict;
-            is_interacting = true;
-        }
-    }
-
-#if defined (_WIN32)
-    signal(SIGINT, SIG_DFL);
-#endif
-     printf("\n\n");
-    gptneox_print_timings(ctx);
-    gptneox_free(ctx);
-
-    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-    return 0;
-}
--- a/third_party/radpajama/quantize-gptneox.cc
+++ b/third_party/radpajama/quantize-gptneox.cc
@ -1,121 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│ vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                             :vi │
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  radpajama.com                                                               │
-│  Copyright (c) 2023 Ariel Núñez                                              │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/log/log.h"
-#include "third_party/ggml/ggml.h"
-#include "third_party/ggml/llama_util.h"
-#include "third_party/libcxx/cstdio"
-#include "third_party/libcxx/map"
-#include "third_party/libcxx/string"
-#include "third_party/radpajama/gptneox.h"
-
-static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
-    {"f16", GPTNEOX_FTYPE_MOSTLY_F16},
-    {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
-    {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
-    {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
-    //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
-    {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
-    {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
-    {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
-};
-
-// usage:
-//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    verynice();
-    ShowCrashReports();
-
-    ggjt_v2();
-    ggml_time_init();
-
-    if (argc < 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
-        for (auto it = GPTNEOX_FTYPE_MAP.begin(); it != GPTNEOX_FTYPE_MAP.end(); it++) {
-            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
-        }
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    if (fname_inp == fname_out) {
-        fprintf(stderr, "%s: input and output names are same\n", fname_inp.c_str());
-        exit(1);
-    }
-
-    enum gptneox_ftype ftype;
-    if (!is_integer_str(argv[3])) {
-        auto it = GPTNEOX_FTYPE_MAP.find(argv[3]);
-        if (it == GPTNEOX_FTYPE_MAP.end()) {
-            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
-            return 1;
-        }
-        ftype = it->second;
-    } else {
-        ftype = (enum gptneox_ftype)atoi(argv[3]);
-    }
-
-    int nthread = argc > 4 ? atoi(argv[4]) : 0;
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (gptneox_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
-    }
-
-    return 0;
-}
--- a/third_party/radpajama/scripts/convert_gptneox_to_ggml.py
+++ b/third_party/radpajama/scripts/convert_gptneox_to_ggml.py
@ -1,144 +0,0 @@
-# Convert Hugging Face fine-tuned gpt-neox-like models to ggml format
-
-import io
-import os
-import sys
-import struct
-import json
-import code
-import torch
-import numpy as np
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-if len(sys.argv) < 3:
-    print("Usage: python convert-hf-to-ggml.py model_name dir-output [use-f32]")
-    print("  model_name: name of the model to convert. Example: 'bigscience/bloomz-560m'")
-    print("  dir-output: directory where the output file will be written")
-    print("  use-f32:    if present, use float32 instead of float16")
-    sys.exit(1)
-
-model_name = sys.argv[1]
-dir_out = sys.argv[2]
-model_cache_dir = dir_out + "-cache"
-
-# make sure the output directory exists
-os.makedirs(dir_out, exist_ok=True)
-
-# possible data types
-#   ftype == 0 -> float32
-#   ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-ftype = 1
-if len(sys.argv) > 3:
-    ftype = 0
-
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-print("Loading model: ", model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16 if ftype == 1 else torch.float32, 
-                                             cache_dir=model_cache_dir)
-model.eval()
-for p in model.parameters():
-    p.requires_grad = False
-hparams = model.config.to_dict()
-print("Model loaded: ", model_name)
-
-fn_bin = f"/ggml-{model_name.split('/')[-1]}-{ftype_str[ftype]}.bin"
-fn_out = dir_out + fn_bin
-fout = open(fn_out, "wb")
-
-ggml_file_magic = 0x67676d66 # 0x67676d6c is unversioned
-ggml_file_version = 0x00000001 # v1
-
-hparams["multiple_of"] = 1
-fout.write(struct.pack("i", ggml_file_magic)) # magic: ggmf in hex
-fout.write(struct.pack("i", ggml_file_version))
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["max_position_embeddings"]))
-fout.write(struct.pack("i", hparams["hidden_size"]))
-fout.write(struct.pack("i", hparams["num_attention_heads"]))
-fout.write(struct.pack("i", hparams["num_hidden_layers"]))
-fout.write(struct.pack("i", int((hparams["hidden_size"] / hparams["num_attention_heads"]
-                             ) * hparams["rotary_pct"]))) # rotary_dim
-fout.write(struct.pack("i", int(hparams["use_parallel_residual"])))
-fout.write(struct.pack("i", ftype))
-
-# Is this correct??
-dot_token = tokenizer.encode(".")[0]
-for i in range(hparams["vocab_size"]):
-    text = tokenizer.decode([i]).encode('utf-8')
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-list_vars = model.state_dict()
-
-print(hparams)
-
-for name in list_vars.keys():
-    if name.startswith('gpt_neox.layers.'):
-        if 'attention.masked_bias' in name or \
-            'attention.rotary_emb.inv_freq' in name or \
-            'attention.bias' in name:
-            continue
-    # No gradients for these
-    list_vars[name].requires_grad = False
-    src = name
-    nn = name
-
-    print(src, ' -> ', name)
-    data = list_vars[src].squeeze().numpy()
-    data = data.astype(np.float32)
-
-    n_dims = len(data.shape)
-    print(name, n_dims, data.shape)
-
-    # default type is fp32
-    ftype_cur = 0
-    if ftype == 1 and n_dims > 1:
-        print("  Converting to float16", data.shape, data[:3, :3].tolist())
-        data = data.astype(np.float16)
-        ftype_cur = 1
-    else:
-        print("  Converting to float32", data.shape,
-              data[:3, :3].tolist() if n_dims > 1 else data[:3].tolist())
-        data = data.astype(np.float32)
-
-    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
-    for i in range(n_dims):
-        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    print(str)
-    fout.write(str)
-
-    # data
-    data.tofile(fout)
-
-fout.close()
-
-print("Done. Output file: " + fn_out)
-print("")
--- a/third_party/radpajama/scripts/install-RedPajama-INCITE-Base-3B-v1.sh
+++ b/third_party/radpajama/scripts/install-RedPajama-INCITE-Base-3B-v1.sh
@ -1,21 +0,0 @@
-#!/bin/bash
-
-# cd to scripts dir
-cd `dirname $0`
-
-# download model to models dir
-echo "Downloading model"
-python ./convert_gptneox_to_ggml.py togethercomputer/RedPajama-INCITE-Base-3B-v1 ../models/pythia
-
-# remove temp cache dir
-echo "Removing temp cache dir"
-rm -r ../models/pythia-cache
-
-# quantize model
-echo "Quantizing model (q4_0)"
-cd ../../..
-python ./examples/redpajama/scripts/quantize-gptneox.py ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Base-3B-v1-f16.bin
-
-
-# done!
-echo "Done."
--- a/third_party/radpajama/scripts/install-RedPajama-INCITE-Chat-3B-v1.sh
+++ b/third_party/radpajama/scripts/install-RedPajama-INCITE-Chat-3B-v1.sh
@ -1,21 +0,0 @@
-#!/bin/bash
-
-# cd to scripts dir
-cd `dirname $0`
-
-# download model to models dir
-echo "Downloading model"
-python ./convert_gptneox_to_ggml.py togethercomputer/RedPajama-INCITE-Chat-3B-v1 ../models/pythia
-
-# remove temp cache dir
-echo "Removing temp cache dir"
-rm -r ../models/pythia-cache
-
-# quantize model
-echo "Quantizing model (q4_0)"
-cd ../../..
-python ./examples/redpajama/scripts/quantize-gptneox.py ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Chat-3B-v1-f16.bin
-
-
-# done!
-echo "Done."
--- a/third_party/radpajama/scripts/install-RedPajama-INCITE-Instruct-3B-v1.sh
+++ b/third_party/radpajama/scripts/install-RedPajama-INCITE-Instruct-3B-v1.sh
@ -1,21 +0,0 @@
-#!/bin/bash
-
-# cd to scripts dir
-cd `dirname $0`
-
-# download model to models dir
-echo "Downloading model"
-python ./convert_gptneox_to_ggml.py togethercomputer/RedPajama-INCITE-Instruct-3B-v1 ../models/pythia
-
-# remove temp cache dir
-echo "Removing temp cache dir"
-rm -r ../models/pythia-cache
-
-# quantize model
-echo "Quantizing model (q4_0)"
-cd ../../..
-python ./examples/redpajama/scripts/quantize-gptneox.py ./examples/redpajama/models/pythia/ggml-RedPajama-INCITE-Instruct-3B-v1-f16.bin
-
-
-# done!
-echo "Done."
--- a/third_party/radpajama/scripts/quantize-gptneox.py
+++ b/third_party/radpajama/scripts/quantize-gptneox.py
@ -1,141 +0,0 @@
-#!/usr/bin/env python3
-
-"""Script to execute the "quantize" script on a given set of models."""
-
-import subprocess
-import argparse
-import glob
-import sys
-import os
-
-
-def main():
-    """Update the quantize binary name depending on the platform and parse
-    the command line arguments and execute the script.
-    """
-
-    if "linux" in sys.platform or "darwin" in sys.platform:
-        quantize_script_binary = "quantize-gptneox"
-
-    elif "win32" in sys.platform or "cygwin" in sys.platform:
-        quantize_script_binary = "quantize-gptneox.exe"
-
-    else:
-        print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
-        quantize_script_binary = "quantize-gptneox"
-
-    parser = argparse.ArgumentParser(
-        prog='python3 quantize-gptneox.py',
-        description='This script quantizes the given models by applying the '
-        f'"{quantize_script_binary}" script on them.'
-    )
-    parser.add_argument('model_path')
-    #parser.add_argument(
-    #    'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
-    #    help='The models to quantize.'
-    #)
-    parser.add_argument(
-        '-r', '--remove-16', action='store_true', dest='remove_f16',
-        help='Remove the f16 model after quantizing it.'
-    )
-    #parser.add_argument(
-    #    '-m', '--models-path', dest='models_path',
-    #    default=os.path.join(os.getcwd(), "models"),
-    #    help='Specify the directory where the models are located.'
-    #)
-    parser.add_argument(
-        '-q', '--quantize-script-path', dest='quantize_script_path',
-        default=os.path.join(os.getcwd(), quantize_script_binary),
-        help='Specify the path to the "quantize" script.'
-    )
-
-    parser.add_argument(
-        '--quantize-output-type', dest='quantize_output_type', type=str,
-        default='q4_0',
-        help='Specify the path to the "quantize" script.'
-    )
-
-
-    # TODO: Revise this code
-    # parser.add_argument(
-    #     '-t', '--threads', dest='threads', type='int',
-    #     default=os.cpu_count(),
-    #     help='Specify the number of threads to use to quantize many models at '
-    #     'once. Defaults to os.cpu_count().'
-    # )
-
-    args = parser.parse_args()
-    args.model_path = os.path.abspath(args.model_path)
-    #args.models_path = os.path.abspath(args.models_path)
-
-    if not os.path.isfile(args.quantize_script_path):
-        print(
-            f'The "{quantize_script_binary}" script was not found in the '
-            "current location.\nIf you want to use it from another location, "
-            "set the --quantize-script-path argument from the command line."
-        )
-        sys.exit(1)
-
-    #for model in args.models:
-    # The model is separated in various parts
-    # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
-    #f16_model_path_base = os.path.join(
-    #    args.models_path, model, "ggml-model-f16.bin"
-    #)
-    f16_model_path_base = args.model_path
-
-    if not os.path.isfile(f16_model_path_base):
-        print(f'The file %s was not found' % f16_model_path_base)
-        sys.exit(1)
-
-    f16_model_parts_paths = map(
-        lambda filename: os.path.join(f16_model_path_base, filename),
-        glob.glob(f"{f16_model_path_base}*")
-    )
-
-    for f16_model_part_path in f16_model_parts_paths:
-        if not os.path.isfile(f16_model_part_path):
-            print(
-                f"The f16 model {os.path.basename(f16_model_part_path)} "
-                f"was not found in {args.models_path}{os.path.sep}"
-                ". If you want to use it from another location, set the "
-                "--models-path argument from the command line."
-            )
-            sys.exit(1)
-
-        __run_quantize_script(
-            args.quantize_script_path, f16_model_part_path, args.quantize_output_type
-        )
-
-        if args.remove_f16:
-            os.remove(f16_model_part_path)
-
-
-# This was extracted to a top-level function for parallelization, if
-# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
-
-def __run_quantize_script(script_path, f16_model_part_path, quantize_output_type):
-    """Run the quantize script specifying the path to it and the path to the
-    f16 model to quantize.
-    """
-
-    new_quantized_model_path = f16_model_part_path.replace("f16", quantize_output_type)
-    subprocess.run(
-        [script_path, f16_model_part_path, new_quantized_model_path, quantize_output_type],
-        check=True
-    )
-
-
-if __name__ == "__main__":
-    try:
-        main()
-
-    except subprocess.CalledProcessError:
-        print("\nAn error ocurred while trying to quantize the models.")
-        sys.exit(1)
-
-    except KeyboardInterrupt:
-        sys.exit(0)
-
-    else:
-        print("\nSuccesfully quantized all models.")