From 282dd8e7b73dfaba7e304b188f7e80f4a71329f4 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@gmail.com>
Date: Sat, 13 May 2023 20:44:36 -0700
Subject: [PATCH] Get radpajama to build

    make -j8 o//third_party/radpajama/radpajama.com
    make -j8 o//third_party/radpajama/radpajama-chat.com

This change gets the radpajama.mk config working. This package depends
on THIRD_PARTY_GGML but it's configured to call ggjt_v1(), so that the
library will provide the old quantizers. The ggml_quantize_chunk() API
will now dispatch to older quantizers based on the configured version.
---
 Makefile                                      |     2 +-
 third_party/ggml/common.cc                    |    26 +-
 third_party/ggml/ggjt.v1.c                    |    10 +
 third_party/ggml/ggml.c                       |    79 +-
 third_party/ggml/ggml.h                       |    13 +-
 third_party/make/job.c                        |     6 +-
 third_party/quickjs/quickjs.mk                |    13 +-
 third_party/radpajama/README.md               |     2 +-
 .../{common-gptneox.cpp => common-gptneox.cc} |   125 +-
 third_party/radpajama/common-gptneox.h        |    27 +-
 third_party/radpajama/common.cc               |   835 -
 third_party/radpajama/common.h                |   136 -
 third_party/radpajama/companionai.txt         |    54 -
 third_party/radpajama/copy-gptneox.cc         |    85 +
 third_party/radpajama/copy-gptneox.cpp        |    57 -
 third_party/radpajama/ggml.c                  | 13394 ----------------
 third_party/radpajama/ggml.h                  |   931 --
 third_party/radpajama/ggml.mk                 |   141 -
 third_party/radpajama/gptneox-util.h          |    95 +-
 .../radpajama/{gptneox.cpp => gptneox.cc}     |   171 +-
 third_party/radpajama/gptneox.h               |     9 +-
 third_party/radpajama/llama.h                 |   256 -
 third_party/radpajama/llama_util.h            |   389 -
 ...pajama-chat.cpp => main-redpajama-chat.cc} |    87 +-
 .../{main-redpajama.cpp => main-redpajama.cc} |    90 +-
 third_party/radpajama/main.cc                 |   891 -
 third_party/radpajama/quantize-gptneox.cc     |   110 +
 third_party/radpajama/quantize-gptneox.cpp    |    82 -
 third_party/radpajama/radpajama.cc            |  3395 ----
 third_party/radpajama/radpajama.mk            |   184 +-
 .../scripts/convert_gptneox_to_ggml.py        |     1 +
 .../install-RedPajama-INCITE-Base-3B-v1.sh    |     1 +
 .../install-RedPajama-INCITE-Chat-3B-v1.sh    |     1 +
 ...install-RedPajama-INCITE-Instruct-3B-v1.sh |     1 +
 .../radpajama/scripts/quantize-gptneox.py     |     1 +
 third_party/third_party.mk                    |     2 +-
 36 files changed, 707 insertions(+), 20995 deletions(-)
 rename third_party/radpajama/{common-gptneox.cpp => common-gptneox.cc} (81%)
 delete mode 100644 third_party/radpajama/common.cc
 delete mode 100644 third_party/radpajama/common.h
 delete mode 100644 third_party/radpajama/companionai.txt
 create mode 100644 third_party/radpajama/copy-gptneox.cc
 delete mode 100644 third_party/radpajama/copy-gptneox.cpp
 delete mode 100644 third_party/radpajama/ggml.c
 delete mode 100644 third_party/radpajama/ggml.h
 delete mode 100644 third_party/radpajama/ggml.mk
 rename third_party/radpajama/{gptneox.cpp => gptneox.cc} (94%)
 delete mode 100644 third_party/radpajama/llama.h
 delete mode 100755 third_party/radpajama/llama_util.h
 rename third_party/radpajama/{main-redpajama-chat.cpp => main-redpajama-chat.cc} (81%)
 rename third_party/radpajama/{main-redpajama.cpp => main-redpajama.cc} (86%)
 delete mode 100644 third_party/radpajama/main.cc
 create mode 100644 third_party/radpajama/quantize-gptneox.cc
 delete mode 100644 third_party/radpajama/quantize-gptneox.cpp
 delete mode 100644 third_party/radpajama/radpajama.cc

diff --git a/Makefile b/Makefile
index 82805239d..4ead08582 100644
--- a/Makefile
+++ b/Makefile
@@ -147,7 +147,7 @@ include net/net.mk				# │
 include third_party/vqsort/vqsort.mk		# │
 include libc/log/log.mk				# │
 include third_party/ggml/ggml.mk		# │
-#include third_party/radpajama/radpajama.mk	# │
+include third_party/radpajama/radpajama.mk	# │
 include third_party/bzip2/bzip2.mk		# │
 include dsp/core/core.mk			# │
 include libc/x/x.mk				# │
diff --git a/third_party/ggml/common.cc b/third_party/ggml/common.cc
index b4270e60c..ed4c1fc6b 100644
--- a/third_party/ggml/common.cc
+++ b/third_party/ggml/common.cc
@@ -60,20 +60,20 @@ static bool is_integer_str(const char *s) {
 static std::string replace_all(std::string const& original,
                                std::string const& before,
                                std::string const& after) {
-  // https://stackoverflow.com/a/7724536/1653720
-  std::string retval;
-  std::string::const_iterator end = original.end();
-  std::string::const_iterator current = original.begin();
-  std::string::const_iterator next =
-      std::search(current, end, before.begin(), before.end());
-  while (next != end) {
+    // https://stackoverflow.com/a/7724536/1653720
+    std::string retval;
+    std::string::const_iterator end = original.end();
+    std::string::const_iterator current = original.begin();
+    std::string::const_iterator next =
+            std::search(current, end, before.begin(), before.end());
+    while (next != end) {
+        retval.append(current, next);
+        retval.append(after);
+        current = next + before.size();
+        next = std::search(current, end, before.begin(), before.end());
+    }
     retval.append(current, next);
-    retval.append(after);
-    current = next + before.size();
-    next = std::search(current, end, before.begin(), before.end());
-  }
-  retval.append(current, next);
-  return retval;
+    return retval;
 }
 
 static bool append_file_to_prompt(const char *path, gpt_params & params) {
diff --git a/third_party/ggml/ggjt.v1.c b/third_party/ggml/ggjt.v1.c
index c1c141b6e..d242e569f 100644
--- a/third_party/ggml/ggjt.v1.c
+++ b/third_party/ggml/ggjt.v1.c
@@ -87,6 +87,15 @@ static const bool ggjt_v1_is_quantized[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I32]  = false,
 };
 
+static const quantize_chunk_f *const ggjt_v2_quantize_chunk[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_Q4_0] = (void *)ggml_quantize_v1_q4_0,
+    [GGML_TYPE_Q4_1] = (void *)ggml_quantize_v1_q4_1,
+    [GGML_TYPE_Q4_2] = (void *)ggml_quantize_v1_q4_2,
+    [GGML_TYPE_Q5_0] = (void *)ggml_quantize_v1_q5_0,
+    [GGML_TYPE_Q5_1] = (void *)ggml_quantize_v1_q5_1,
+    [GGML_TYPE_Q8_0] = (void *)ggml_quantize_v1_q8_0,
+};
+
 static const quantize_fns_t ggjt_v1_quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
         .dequantize_row_q         = dequantize_row_v1_q4_0,
@@ -152,4 +161,5 @@ void ggjt_v1(void) {
     GGML_TYPE_NAME = ggjt_v1_type_name;
     GGML_IS_QUANTIZED = ggjt_v1_is_quantized;
     quantize_fns = ggjt_v1_quantize_fns;
+    GGML_QUANTIZE_CHUNK = ggjt_v2_quantize_chunk;
 }
diff --git a/third_party/ggml/ggml.c b/third_party/ggml/ggml.c
index 4e75185df..b0c375612 100644
--- a/third_party/ggml/ggml.c
+++ b/third_party/ggml/ggml.c
@@ -46,6 +46,8 @@
 #include "third_party/ggml/fp16.h"
 #include "third_party/ggml/fp16.internal.h"
 #include "libc/assert.h"
+#include "libc/assert.h"
+#include "third_party/ggml/ggml.h"
 #include "third_party/libcxx/math.h"
 
 asm(".ident\t\"\\n\\n\
@@ -2810,14 +2812,6 @@ const bool ggjt_v2_is_quantized[GGML_TYPE_COUNT] = {
 const bool *GGML_IS_QUANTIZED;
 static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
 
-void ggjt_v2(void) {
-  GGML_BLCK_SIZE = ggjt_v2_blck_size;
-  GGML_TYPE_SIZE = ggjt_v2_type_size;
-  GGML_TYPE_NAME = ggjt_v2_type_name;
-  GGML_IS_QUANTIZED = ggjt_v2_is_quantized;
-  quantize_fns = ggjt_v2_quantize_fns;
-}
-
 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
 
@@ -11919,43 +11913,40 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
+static const quantize_chunk_f *const ggjt_v2_quantize_chunk[GGML_TYPE_COUNT] = {
+    [GGML_TYPE_Q4_0] = ggml_quantize_q4_0,
+    [GGML_TYPE_Q4_1] = ggml_quantize_q4_1,
+    [GGML_TYPE_Q5_0] = ggml_quantize_q5_0,
+    [GGML_TYPE_Q5_1] = ggml_quantize_q5_1,
+    [GGML_TYPE_Q8_0] = ggml_quantize_q8_0,
+};
+const quantize_chunk_f *const *GGML_QUANTIZE_CHUNK;
+static_assert(GGML_TYPE_COUNT == 13, "GGML_QUANTIZE_CHUNK is outdated");
+
 size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
-    size_t result = 0;
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            {
-                GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q4_1:
-            {
-                GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q5_0:
-            {
-                GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q5_1:
-            {
-                GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q8_0:
-            {
-                GGML_ASSERT(start % QK8_0 == 0);
-                block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
-                result = ggml_quantize_q8_0(src + start, block, n, n, hist);
-            } break;
-        default:
-            assert(false);
-    }
-    return result;
+    void *block;
+    int blcksize, typesize;
+    GGML_ASSERT(GGML_QUANTIZE_CHUNK);
+    GGML_ASSERT((unsigned)type < GGML_TYPE_COUNT);
+    GGML_ASSERT(GGML_QUANTIZE_CHUNK[type]);
+    GGML_ASSERT(GGML_BLCK_SIZE[type]);
+    GGML_ASSERT(GGML_TYPE_SIZE[type]);
+    GGML_ASSERT(start % GGML_BLCK_SIZE[type] == 0);
+    blcksize = GGML_BLCK_SIZE[type];
+    typesize = GGML_TYPE_SIZE[type];
+    block = (char *)dst + start / blcksize * typesize;
+    return GGML_QUANTIZE_CHUNK[type](src + start, block, n, n, hist);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+void ggjt_v2(void) {
+    GGML_BLCK_SIZE = ggjt_v2_blck_size;
+    GGML_TYPE_SIZE = ggjt_v2_type_size;
+    GGML_TYPE_NAME = ggjt_v2_type_name;
+    GGML_IS_QUANTIZED = ggjt_v2_is_quantized;
+    quantize_fns = ggjt_v2_quantize_fns;
+    GGML_QUANTIZE_CHUNK = ggjt_v2_quantize_chunk;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/ggml/ggml.h b/third_party/ggml/ggml.h
index 98ae577e1..c8f4dc138 100644
--- a/third_party/ggml/ggml.h
+++ b/third_party/ggml/ggml.h
@@ -863,6 +863,9 @@ COSMOPOLITAN_C_START_
     // quantization
     //
 
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+
+    // NOTE: These quant APIs will always use the newest version (even if ggjt_v1() was called)
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
@@ -870,8 +873,6 @@ COSMOPOLITAN_C_START_
     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
 
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
-
     //
     // system info
     //
@@ -904,9 +905,10 @@ COSMOPOLITAN_C_START_
 #else
 #define GGML_RESTRICT restrict
 #endif
-    typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-    typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void   (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void   (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+    typedef size_t quantize_chunk_f     (const float * src, void * dst, int n, int k, int64_t * hist);
+    typedef void   (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
     typedef struct {
         dequantize_row_q_t dequantize_row_q;
@@ -924,6 +926,7 @@ COSMOPOLITAN_C_START_
     extern const bool *GGML_IS_QUANTIZED;
     extern const char *const *GGML_TYPE_NAME;
     extern const quantize_fns_t *quantize_fns;
+    extern const quantize_chunk_f *const *GGML_QUANTIZE_CHUNK;
 
 COSMOPOLITAN_C_END_
 #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
diff --git a/third_party/make/job.c b/third_party/make/job.c
index c313a715a..fda914ef0 100644
--- a/third_party/make/job.c
+++ b/third_party/make/job.c
@@ -1810,7 +1810,7 @@ unveil_variable (const struct variable *var)
   return -1;
 }
 
-int
+static int
 get_base_cpu_freq_mhz (void)
 {
   return KCPUIDS(16H, EAX) & 0x7fff;
@@ -1830,7 +1830,7 @@ set_limit (int r, long lo, long hi)
   return setrlimit (r, &lim);
 }
 
-int
+static int
 set_cpu_limit (int secs)
 {
   int mhz, lim;
@@ -1954,6 +1954,7 @@ child_execute_job (struct childbase *child,
        strict ? " in .STRICT mode" : "",
        internet ? " with internet access" : ""));
 
+#ifdef __x86_64__
   /* [jart] Set cpu seconds quota.  */
   if (RLIMIT_CPU < RLIM_NLIMITS &&
       (s = get_target_variable (STRING_SIZE_TUPLE (".CPU"),
@@ -1966,6 +1967,7 @@ child_execute_job (struct childbase *child,
       else
         DB (DB_JOBS, (_("Failed to set CPU limit: %s\n"), strerror (errno)));
     }
+#endif /* __x86_64__ */
 
   /* [jart] Set virtual memory quota.  */
   if (RLIMIT_AS < RLIM_NLIMITS &&
diff --git a/third_party/quickjs/quickjs.mk b/third_party/quickjs/quickjs.mk
index 6256e6fc5..076f47e1d 100644
--- a/third_party/quickjs/quickjs.mk
+++ b/third_party/quickjs/quickjs.mk
@@ -137,13 +137,15 @@ THIRD_PARTY_QUICKJS_CHECKS =							\
 
 o/$(MODE)/third_party/quickjs/qjscalc.c:					\
 		third_party/quickjs/qjscalc.js					\
-		o/$(MODE)/third_party/quickjs/qjsc.com
-	@$(COMPILE) -wAQJSC o/$(MODE)/third_party/quickjs/qjsc.com -fbignum -o $@ -c $<
+		o/$(MODE)/third_party/quickjs/qjsc.com				\
+		$(VM)
+	@$(COMPILE) -wAQJSC $(VM) o/$(MODE)/third_party/quickjs/qjsc.com -fbignum -o $@ -c $<
 
 o/$(MODE)/third_party/quickjs/repl.c:						\
 		third_party/quickjs/repl.js					\
-		o/$(MODE)/third_party/quickjs/qjsc.com
-	@$(COMPILE) -wAQJSC o/$(MODE)/third_party/quickjs/qjsc.com -o $@ -m -c $<
+		o/$(MODE)/third_party/quickjs/qjsc.com				\
+		$(VM)
+	@$(COMPILE) -wAQJSC $(VM) o/$(MODE)/third_party/quickjs/qjsc.com -o $@ -m -c $<
 
 o/$(MODE)/third_party/quickjs/qjs.com.dbg:					\
 		$(THIRD_PARTY_QUICKJS)						\
@@ -157,7 +159,8 @@ o/$(MODE)/third_party/quickjs/qjs.com.dbg:					\
 o/$(MODE)/third_party/quickjs/qjs.com:						\
 		o/$(MODE)/third_party/quickjs/qjs.com.dbg			\
 		o/$(MODE)/third_party/zip/zip.com				\
-		o/$(MODE)/tool/build/symtab.com
+		o/$(MODE)/tool/build/symtab.com					\
+		$(VM)
 	@$(MAKE_OBJCOPY)
 	@$(MAKE_SYMTAB_CREATE)
 	@$(MAKE_SYMTAB_ZIP)
diff --git a/third_party/radpajama/README.md b/third_party/radpajama/README.md
index 68054411f..39b00238a 100644
--- a/third_party/radpajama/README.md
+++ b/third_party/radpajama/README.md
@@ -140,4 +140,4 @@ gptneox-util.h
 gptneox.h
 convert_gptneox_to_ggml.py
 quantize-gptneox.py
-```
\ No newline at end of file
+```
diff --git a/third_party/radpajama/common-gptneox.cpp b/third_party/radpajama/common-gptneox.cc
similarity index 81%
rename from third_party/radpajama/common-gptneox.cpp
rename to third_party/radpajama/common-gptneox.cc
index a522f0837..4a6b7ac02 100644
--- a/third_party/radpajama/common-gptneox.cpp
+++ b/third_party/radpajama/common-gptneox.cc
@@ -1,42 +1,45 @@
-#include "common-gptneox.h"
-
-#include <cassert>
-#include <cstring>
-#include <fstream>
-#include <string>
-#include <iterator>
-#include <algorithm>
-#include <sstream>
-#include <iostream>
-
-#if defined (_WIN32)
-#include <fcntl.h>
-#include <io.h>
-#pragma comment(lib,"kernel32.lib")
-extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
-extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
-extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
-extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
-extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
-extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
-                                                                   const wchar_t * lpWideCharStr, int cchWideChar,
-                                                                   char * lpMultiByteStr, int cbMultiByte,
-                                                                   const char * lpDefaultChar, bool * lpUsedDefaultChar);
-#define CP_UTF8 65001
-#endif
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
+╚──────────────────────────────────────────────────────────────────────────────╝
+│                                                                              │
+│  radpajama.com                                                               │
+│  Copyright (c) 2023 Ariel Núñez                                              │
+│  Copyright (c) 2023 Georgi Gerganov                                          │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files (the             │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and/or sell copies of the Software, and to          │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│  The above copyright notice and this permission notice shall be              │
+│  included in all copies or substantial portions of the Software.             │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
+│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
+│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
+│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
+│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
+│                                                                              │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "third_party/radpajama/common-gptneox.h"
+#include "third_party/ggml/llama_util.h"
+#include "third_party/libcxx/algorithm"
+#include "third_party/libcxx/cassert"
+#include "third_party/libcxx/cstring"
+#include "third_party/libcxx/fstream"
+#include "third_party/libcxx/iostream"
+#include "third_party/libcxx/iterator"
+#include "third_party/libcxx/sstream"
+#include "third_party/libcxx/string"
+// clang-format off
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    // determine sensible default number of threads.
-    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
-#ifdef __linux__
-    std::ifstream cpuinfo("/proc/cpuinfo");
-    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
-                                  std::istream_iterator<std::string>(),
-                                  std::string("processor"));
-#endif
-    if (params.n_threads == 0) {
-        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
-    }
+    params.n_threads = std::min(20, std::max(1, (int)(_getcpucount() * 0.75)));
 
     bool invalid_param = false;
     std::string arg;
@@ -238,16 +241,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             std::stringstream ss(argv[i]);
-            gptneox_token key;
-            char sign;
+            gptneox_token key = 0;
+            char sign = 0;
             std::string value_str;
-            try {
-                if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                    params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-                } else {
-                    throw std::exception();
-                }
-            } catch (const std::exception &e) {
+            if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
+                params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
+            } else {
                 invalid_param = true;
                 break;
             }
@@ -393,37 +392,3 @@ void set_console_color(console_state & con_st, console_color_t color) {
         con_st.color = color;
     }
 }
-
-#if defined (_WIN32)
-void win32_console_init(bool enable_color) {
-    unsigned long dwMode = 0;
-    void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
-    if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
-        hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
-        if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
-            hConOut = 0;
-        }
-    }
-    if (hConOut) {
-        // Enable ANSI colors on Windows 10+
-        if (enable_color && !(dwMode & 0x4)) {
-            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
-        }
-        // Set console output codepage to UTF8
-        SetConsoleOutputCP(CP_UTF8);
-    }
-    void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
-    if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
-        // Set console input codepage to UTF16
-        _setmode(_fileno(stdin), _O_WTEXT);
-    }
-}
-
-// Convert a wide Unicode string to an UTF8 string
-void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
-    int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
-    std::string strTo(size_needed, 0);
-    WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
-    str = strTo;
-}
-#endif
diff --git a/third_party/radpajama/common-gptneox.h b/third_party/radpajama/common-gptneox.h
index 81c25494e..21cdc66b4 100644
--- a/third_party/radpajama/common-gptneox.h
+++ b/third_party/radpajama/common-gptneox.h
@@ -1,22 +1,24 @@
+#ifndef COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
+#define COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_
+#include "libc/macros.internal.h"
+#include "libc/runtime/runtime.h"
+#include "third_party/libcxx/random"
+#include "third_party/libcxx/string"
+#include "third_party/libcxx/thread"
+#include "third_party/libcxx/unordered_map"
+#include "third_party/libcxx/vector"
+#include "third_party/radpajama/gptneox.h"
+#if !(__ASSEMBLER__ + __LINKER__ + 0)
+// clang-format off
 // Various helper functions and utilities
 
-#pragma once
-
-#include "gptneox.h"
-
-#include <string>
-#include <vector>
-#include <random>
-#include <thread>
-#include <unordered_map>
-
 //
 // CLI argument parsing
 //
 
 struct gpt_params {
     int32_t seed          = -1;   // RNG seed
-    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_threads     = MIN(4, (int32_t) _getcpucount() * 0.75);
     int32_t n_predict     = 128;  // new tokens to predict
     int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx         = 512;  // context size
@@ -106,3 +108,6 @@ void set_console_color(console_state & con_st, console_color_t color);
 void win32_console_init(bool enable_color);
 void win32_utf8_encode(const std::wstring & wstr, std::string & str);
 #endif
+
+#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
+#endif /* COSMOPOLITAN_THIRD_PARTY_RADPAJAMA_COMMON_GPTNEOX_H_ */
diff --git a/third_party/radpajama/common.cc b/third_party/radpajama/common.cc
deleted file mode 100644
index 83bbde5f5..000000000
--- a/third_party/radpajama/common.cc
+++ /dev/null
@@ -1,835 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  llama.com                                                                   │
-│  Copyright (c) 2023 Justine Alexandra Roberts Tunney                         │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/common.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/termios.h"
-#include "libc/calls/termios.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
-#include "libc/str/str.h"
-#include "libc/sysv/consts/fileno.h"
-#include "third_party/libcxx/algorithm"
-#include "third_party/libcxx/cassert"
-#include "third_party/libcxx/cstring"
-#include "third_party/libcxx/fstream"
-#include "third_party/libcxx/iterator"
-#include "third_party/libcxx/sstream"
-#include "third_party/libcxx/string"
-
-STATIC_YOINK("zipos");
-
-asm(".ident\t\"\\n\\n\
-llama.cpp (MIT License)\\n\
-Copyright (c) 2023 Georgi Gerganov\"");
-asm(".include \"libc/disclaimer.inc\"");
-// clang-format off
-
-static bool is_integer_str(const char *s) {
-    if (*s == '-') ++s;
-    if (!*s) return false;
-    while (isdigit(*s)) ++s;
-    return !*s;
-}
-
-static std::string replace_all(std::string const& original,
-                               std::string const& before,
-                               std::string const& after) {
-  // https://stackoverflow.com/a/7724536/1653720
-  std::string retval;
-  std::string::const_iterator end = original.end();
-  std::string::const_iterator current = original.begin();
-  std::string::const_iterator next =
-      std::search(current, end, before.begin(), before.end());
-  while (next != end) {
-    retval.append(current, next);
-    retval.append(after);
-    current = next + before.size();
-    next = std::search(current, end, before.begin(), before.end());
-  }
-  retval.append(current, next);
-  return retval;
-}
-
-static bool append_file_to_prompt(const char *path, gpt_params & params) {
-    std::ifstream file(path);
-    if (!file) {
-        fprintf(stderr, "error: failed to open file '%s'\n", path);
-        return false;
-    }
-    std::copy(std::istreambuf_iterator<char>(file),
-              std::istreambuf_iterator<char>(),
-              back_inserter(params.prompt));
-    if (params.prompt.back() == '\n') {
-        params.prompt.pop_back();
-    }
-    return true;
-}
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
-    params.n_threads = std::min(20, std::max(1, (int)(_getcpucount() * 0.75)));
-
-    bool invalid_param = false;
-    std::string arg;
-    gpt_params default_params;
-
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg == "-s" || arg == "--seed") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.seed = std::stoi(argv[i]);
-        } else if (arg == "-v" || arg == "--verbose") {
-            ++params.verbose;
-        } else if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "-p" || arg == "--prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt = argv[i];
-        } else if (arg == "-C" || arg == "--prompt_cache") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.prompt_path = argv[i];
-        } else if (arg == "-f" || arg == "--file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            if (!append_file_to_prompt(argv[i], params)) {
-                invalid_param = true;
-                break;
-            }
-        } else if (arg == "-n" || arg == "--n_predict") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "--top_k") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.top_k = std::stoi(argv[i]);
-        } else if (arg == "-c" || arg == "--ctx_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--memory_f32") {
-            params.memory_f16 = false;
-        } else if (arg == "--top_p") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.top_p = std::stof(argv[i]);
-        } else if (arg == "--temp") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.temp = std::stof(argv[i]);
-        } else if (arg == "--repeat_last_n") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.repeat_last_n = std::stoi(argv[i]);
-        } else if (arg == "--repeat_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.repeat_penalty = std::stof(argv[i]);
-        } else if (arg == "--frequency_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.frequency_penalty = std::stof(argv[i]);
-        } else if (arg == "--presence_penalty") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.presence_penalty = std::stof(argv[i]);
-        } else if (arg == "--mirostat") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.mirostat = std::stoi(argv[i]);
-        } else if (arg == "--mirostat_lr") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.mirostat_eta = std::stof(argv[i]);
-        } else if (arg == "--mirostat_ent") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.mirostat_tau = std::stof(argv[i]);
-        } else if (arg == "-b" || arg == "--batch_size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
-        } else if (arg == "--keep") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_keep_str = argv[i];
-            if (is_integer_str(argv[i])) {
-                params.n_keep = std::stoi(params.n_keep_str);
-                if (!params.n_keep) {
-                    params.n_keep_str = "";
-                }
-            }
-        } else if (arg == "-m" || arg == "--model") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.model = argv[i];
-        } else if (arg == "--lora") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_adapter = argv[i];
-            params.use_mmap = false;
-        } else if (arg == "--lora-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.lora_base = argv[i];
-        } else if (arg == "-i" || arg == "--interactive") {
-            params.interactive = true;
-        } else if (arg == "--embedding") {
-            params.embedding = true;
-        } else if (arg == "--interactive-first") {
-            params.interactive_first = true;
-        } else if (arg == "-ins" || arg == "--instruct") {
-            params.instruct = true;
-        } else if (arg == "--multiline-input") {
-            params.multiline_input = true;
-        } else if (arg == "--color") {
-            params.use_color = true;
-        } else if (arg == "--mlock") {
-            params.use_mlock = true;
-        } else if (arg == "--no-mmap") {
-            params.use_mmap = false;
-        } else if (arg == "--mtest") {
-            params.mem_test = true;
-        } else if (arg == "--verbose-prompt") {
-            params.verbose_prompt = true;
-        } else if (arg == "-r" || arg == "--reverse-prompt") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.antiprompt.push_back(argv[i]);
-        } else if (arg == "--perplexity") {
-            params.perplexity = true;
-        } else if (arg == "--ignore-eos") {
-            params.logit_bias[llama_token_eos()] = -INFINITY;
-        } else if (arg == "--no-penalize-nl") {
-            params.penalize_nl = false;
-        } else if (arg == "-l" || arg == "--logit-bias") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::stringstream ss(argv[i]);
-            llama_token key = 0;
-            char sign = 0;
-            std::string value_str;
-            if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
-                params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-            } else {
-                invalid_param = true;
-                break;
-            }
-        } else if (arg == "--n_parts") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_parts = std::stoi(argv[i]);
-        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(stdout, argc, argv, default_params);
-            exit(0);
-        } else if (arg == "--random-prompt") {
-            params.random_prompt = true;
-        } else if (arg == "--in-prefix") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.input_prefix = argv[i];
-        } else if (arg == "--in-suffix") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.input_suffix = argv[i];
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(stderr, argc, argv, default_params);
-            exit(1);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(stderr, argc, argv, default_params);
-        exit(1);
-    }
-
-    // if no prompt is specified, then use companion ai
-    if (params.prompt.empty()) {
-        if (params.verbose) {
-            fprintf(stderr, "%s: No prompt specified\n", __func__);
-            fprintf(stderr, "%s: Loading CompanionAI\n", __func__);
-        }
-        append_file_to_prompt("/zip/companionai.txt", params);
-        const char *user;
-        user = getenv("USER");
-        if (!user || !*user) {
-            user = "Cosmo";
-        }
-        params.prompt = replace_all(params.prompt, "USER_NAME", user);
-        std::string user_prompt;
-        user_prompt.append(user);
-        user_prompt.append(":");
-        params.logit_bias[llama_token_eos()] = -INFINITY;
-        params.antiprompt.push_back(user_prompt);
-        params.repeat_penalty = 1.17647;
-        params.repeat_last_n = 256;
-        params.interactive = true;
-        params.n_predict = -1;
-        params.n_ctx = 2048;
-        params.n_keep = 0;
-        params.n_keep_str = "\n\n\n";
-        params.top_k = 40;
-        params.top_p = .5;
-        params.temp = 0.4;
-    }
-
-    return true;
-}
-
-void gpt_print_usage(FILE *f, int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(f, "usage: %s [options]\n", argv[0]);
-    fprintf(f, "\n");
-    fprintf(f, "options:\n");
-    fprintf(f, "  -h, --help            show this help message and exit\n");
-    fprintf(f, "  -v, --verbose         print plenty of helpful information, e.g. prompt\n");
-    fprintf(f, "  -i, --interactive     run in interactive mode\n");
-    fprintf(f, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(f, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    fprintf(f, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(f, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(f, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
-    fprintf(f, "                        specified more than once for multiple prompts).\n");
-    fprintf(f, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(f, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(f, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(f, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(f, "                        prompt to start generation with (default: Companion AI)\n");
-    fprintf(f, "  --random-prompt       start with a randomized prompt.\n");
-    fprintf(f, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(f, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
-    fprintf(f, "  -f FNAME, --file FNAME\n");
-    fprintf(f, "                        text file containing prompt (default: Companion AI)\n");
-    fprintf(f, "  -C FNAME, --prompt_cache FNAME\n");
-    fprintf(f, "                        path of cache for fast prompt reload (default: .prompt.jtlp)\n");
-    fprintf(f, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(f, "  --top_k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(f, "  --top_p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    fprintf(f, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    fprintf(f, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(f, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(f, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(f, "  --presence_penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(f, "  --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    fprintf(f, "  --mirostat N          use Mirostat sampling.\n");
-    fprintf(f, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    fprintf(f, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(f, "  --mirostat_lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(f, "  --mirostat_ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    fprintf(f, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    fprintf(f, "                        modifies the likelihood of token appearing in the completion,\n");
-    fprintf(f, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    fprintf(f, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(f, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
-    fprintf(f, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
-    fprintf(f, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(f, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    fprintf(f, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(f, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
-    fprintf(f, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(f, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
-    fprintf(f, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(f, "  --perplexity          compute perplexity over the prompt\n");
-    fprintf(f, "  --keep NUM|STR        number of tokens to keep from the initial prompt, or substring\n");
-    fprintf(f, "                        to search for within prompt that divides the actual prompt from\n");
-    fprintf(f, "                        its initial example text (default: %d, -1 = all)\n", params.n_keep);
-    if (llama_mlock_supported()) {
-        fprintf(f, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
-    }
-    if (llama_mmap_supported()) {
-        fprintf(f, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
-    }
-    fprintf(f, "  --mtest               compute maximum memory usage\n");
-    fprintf(f, "  --verbose-prompt      print prompt before generation\n");
-    fprintf(f, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(f, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(f, "  -m FNAME, --model FNAME\n");
-    fprintf(f, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(f, "\n");
-}
-
-std::string gpt_random_prompt(std::mt19937 & rng) {
-    const int r = rng() % 10;
-    switch (r) {
-        case 0: return "So";
-        case 1: return "Once upon a time";
-        case 2: return "When";
-        case 3: return "The";
-        case 4: return "After";
-        case 5: return "If";
-        case 6: return "import";
-        case 7: return "He";
-        case 8: return "She";
-        case 9: return "They";
-        default: return "To";
-    }
-    return "The";
-}
-
-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-    return res;
-}
-
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
-    auto lparams = llama_context_default_params();
-
-    lparams.n_ctx      = params.n_ctx;
-    lparams.n_parts    = params.n_parts;
-    lparams.seed       = params.seed;
-    lparams.f16_kv     = params.memory_f16;
-    lparams.use_mmap   = params.use_mmap;
-    lparams.use_mlock  = params.use_mlock;
-    lparams.logits_all = params.perplexity;
-    lparams.embedding  = params.embedding;
-
-    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams, params.verbose);
-
-    if (lctx == NULL) {
-        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return NULL;
-    }
-
-    if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(lctx,
-                                             params.lora_adapter.c_str(),
-                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return NULL;
-        }
-    }
-
-    return lctx;
-}
-
-void console_init(console_state & con_st) {
-#if defined(_WIN32)
-    // Windows-specific console initialization
-    DWORD dwMode = 0;
-    con_st.hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
-    if (con_st.hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(con_st.hConsole, &dwMode)) {
-        con_st.hConsole = GetStdHandle(STD_ERROR_HANDLE);
-        if (con_st.hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(con_st.hConsole, &dwMode))) {
-            con_st.hConsole = NULL;
-        }
-    }
-    if (con_st.hConsole) {
-        // Enable ANSI colors on Windows 10+
-        if (con_st.use_color && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
-            SetConsoleMode(con_st.hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
-        }
-        // Set console output codepage to UTF8
-        SetConsoleOutputCP(CP_UTF8);
-    }
-    HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
-    if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
-        // Set console input codepage to UTF16
-        _setmode(_fileno(stdin), _O_WTEXT);
-
-        // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
-        dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
-        SetConsoleMode(hConIn, dwMode);
-    }
-#else
-    // POSIX-specific console initialization
-    struct termios new_termios;
-    tcgetattr(STDIN_FILENO, &con_st.prev_state);
-    new_termios = con_st.prev_state;
-    new_termios.c_lflag &= ~(ICANON | ECHO);
-    new_termios.c_cc[VMIN] = 1;
-    new_termios.c_cc[VTIME] = 0;
-    tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
-
-    con_st.tty = fopen("/dev/tty", "w+");
-    if (con_st.tty != nullptr) {
-        setvbuf(con_st.tty, NULL, _IONBF, 0);
-        con_st.out = con_st.tty;
-    }
-
-    setlocale(LC_ALL, "");
-#endif
-}
-
-void console_cleanup(console_state & con_st) {
-    // Reset console color
-    console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-#if !defined(_WIN32)
-    if (con_st.tty != nullptr) {
-        con_st.out = stdout;
-        fclose(con_st.tty);
-        con_st.tty = nullptr;
-    }
-    // Restore the terminal settings on POSIX systems
-    tcsetattr(STDIN_FILENO, TCSANOW, &con_st.prev_state);
-#endif
-}
-
-/* Keep track of current color of output, and emit ANSI code if it changes. */
-void console_set_color(console_state & con_st, console_color_t color) {
-    if (con_st.use_color && con_st.color != color) {
-        fflush(stdout);
-        switch(color) {
-            case CONSOLE_COLOR_DEFAULT:
-                fprintf(con_st.out, ANSI_COLOR_RESET);
-                break;
-            case CONSOLE_COLOR_PROMPT:
-                fprintf(con_st.out, ANSI_COLOR_YELLOW);
-                break;
-            case CONSOLE_COLOR_USER_INPUT:
-                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
-                break;
-        }
-        con_st.color = color;
-        fflush(con_st.out);
-    }
-}
-
-char32_t getchar32() {
-    wchar_t wc = getwchar();
-    if (static_cast<wint_t>(wc) == WEOF) {
-        return WEOF;
-    }
-
-#if WCHAR_MAX == 0xFFFF
-    if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-        wchar_t low_surrogate = getwchar();
-        if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
-            return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
-        }
-    }
-    if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
-        return 0xFFFD; // Return the replacement character U+FFFD
-    }
-#endif
-
-    return static_cast<char32_t>(wc);
-}
-
-void pop_cursor(console_state & con_st) {
-#if defined(_WIN32)
-    if (con_st.hConsole != NULL) {
-        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-        GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo);
-
-        COORD newCursorPosition = bufferInfo.dwCursorPosition;
-        if (newCursorPosition.X == 0) {
-            newCursorPosition.X = bufferInfo.dwSize.X - 1;
-            newCursorPosition.Y -= 1;
-        } else {
-            newCursorPosition.X -= 1;
-        }
-
-        SetConsoleCursorPosition(con_st.hConsole, newCursorPosition);
-        return;
-    }
-#endif
-    putc('\b', con_st.out);
-}
-
-int estimateWidth(char32_t codepoint) {
-#if defined(_WIN32)
-    return 1;
-#else
-    return wcwidth(codepoint);
-#endif
-}
-
-int put_codepoint(console_state & con_st, const char* utf8_codepoint, size_t length, int expectedWidth) {
-#if defined(_WIN32)
-    CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-    if (!GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo)) {
-        // go with the default
-        return expectedWidth;
-    }
-    COORD initialPosition = bufferInfo.dwCursorPosition;
-    DWORD nNumberOfChars = length;
-    WriteConsole(con_st.hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
-
-    CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
-    GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
-
-    // Figure out our real position if we're in the last column
-    if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
-        DWORD nNumberOfChars;
-        WriteConsole(con_st.hConsole, &" \b", 2, &nNumberOfChars, NULL);
-        GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
-    }
-
-    int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
-    if (width < 0) {
-        width += newBufferInfo.dwSize.X;
-    }
-    return width;
-#else
-    // we can trust expectedWidth if we've got one
-    if (expectedWidth >= 0 || con_st.tty == nullptr) {
-        fwrite(utf8_codepoint, length, 1, con_st.out);
-        return expectedWidth;
-    }
-
-    fputs("\033[6n", con_st.tty); // Query cursor position
-    int x1, x2, y1, y2;
-    int results = 0;
-    results = fscanf(con_st.tty, "\033[%d;%dR", &y1, &x1);
-
-    fwrite(utf8_codepoint, length, 1, con_st.tty);
-
-    fputs("\033[6n", con_st.tty); // Query cursor position
-    results += fscanf(con_st.tty, "\033[%d;%dR", &y2, &x2);
-
-    if (results != 4) {
-        return expectedWidth;
-    }
-
-    int width = x2 - x1;
-    if (width < 0) {
-        // Calculate the width considering text wrapping
-        struct winsize w;
-        ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
-        width += w.ws_col;
-    }
-    return width;
-#endif
-}
-
-void replace_last(console_state & con_st, char ch) {
-#if defined(_WIN32)
-    pop_cursor(con_st);
-    put_codepoint(con_st, &ch, 1, 1);
-#else
-    fprintf(con_st.out, "\b%c", ch);
-#endif
-}
-
-void append_utf8(char32_t ch, std::string & out) {
-    if (ch <= 0x7F) {
-        out.push_back(static_cast<unsigned char>(ch));
-    } else if (ch <= 0x7FF) {
-        out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-    } else if (ch <= 0xFFFF) {
-        out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-    } else if (ch <= 0x10FFFF) {
-        out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-    } else {
-        // Invalid Unicode code point
-    }
-}
-
-// Helper function to remove the last UTF-8 character from a string
-void pop_back_utf8_char(std::string & line) {
-    if (line.empty()) {
-        return;
-    }
-
-    size_t pos = line.length() - 1;
-
-    // Find the start of the last UTF-8 character (checking up to 4 bytes back)
-    for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
-        if ((line[pos] & 0xC0) != 0x80) break; // Found the start of the character
-    }
-    line.erase(pos);
-}
-
-bool console_readline(console_state & con_st, std::string & line) {
-    console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-    if (con_st.out != stdout) {
-        fflush(stdout);
-    }
-
-    line.clear();
-    std::vector<int> widths;
-    bool is_special_char = false;
-    bool end_of_stream = false;
-
-    char32_t input_char;
-    while (true) {
-        fflush(con_st.out); // Ensure all output is displayed before waiting for input
-        input_char = getchar32();
-
-        if (input_char == '\r' || input_char == '\n') {
-            break;
-        }
-
-        if (input_char == WEOF || input_char == 0x04 /* Ctrl+D*/) {
-            end_of_stream = true;
-            break;
-        }
-
-        if (is_special_char) {
-            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-            replace_last(con_st, line.back());
-            is_special_char = false;
-        }
-
-        if (input_char == '\033') { // Escape sequence
-            char32_t code = getchar32();
-            if (code == '[' || code == 0x1B) {
-                // Discard the rest of the escape sequence
-                while ((code = getchar32()) != WEOF) {
-                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
-                        break;
-                    }
-                }
-            }
-        } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-            if (!widths.empty()) {
-                int count;
-                do {
-                    count = widths.back();
-                    widths.pop_back();
-                    // Move cursor back, print space, and move cursor back again
-                    for (int i = 0; i < count; i++) {
-                        replace_last(con_st, ' ');
-                        pop_cursor(con_st);
-                    }
-                    pop_back_utf8_char(line);
-                } while (count == 0 && !widths.empty());
-            }
-        } else {
-            int offset = line.length();
-            append_utf8(input_char, line);
-            int width = put_codepoint(con_st, line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
-            if (width < 0) {
-                width = 0;
-            }
-            widths.push_back(width);
-        }
-
-        if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
-            console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-            replace_last(con_st, line.back());
-            is_special_char = true;
-        }
-    }
-
-    bool has_more = con_st.multiline_input;
-    if (is_special_char) {
-        replace_last(con_st, ' ');
-        pop_cursor(con_st);
-
-        char last = line.back();
-        line.pop_back();
-        if (last == '\\') {
-            line += '\n';
-            fputc('\n', con_st.out);
-            has_more = !has_more;
-        } else {
-            // llama will just eat the single space, it won't act as a space
-            if (line.length() == 1 && line.back() == ' ') {
-                line.clear();
-                pop_cursor(con_st);
-            }
-            has_more = false;
-        }
-    } else {
-        if (end_of_stream) {
-            has_more = false;
-        } else {
-            line += '\n';
-            fputc('\n', con_st.out);
-        }
-    }
-
-    fflush(con_st.out);
-    return has_more;
-}
diff --git a/third_party/radpajama/common.h b/third_party/radpajama/common.h
deleted file mode 100644
index 9964c92fa..000000000
--- a/third_party/radpajama/common.h
+++ /dev/null
@@ -1,136 +0,0 @@
-// -*- c++ -*-
-#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
-#define COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
-#include "libc/calls/struct/termios.h"
-#include "libc/runtime/runtime.h"
-#include "libc/stdio/stdio.h"
-#include "third_party/ggml/llama.h"
-#include "third_party/libcxx/random"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/thread"
-#include "third_party/libcxx/unordered_map"
-#include "third_party/libcxx/vector"
-#if !(__ASSEMBLER__ + __LINKER__ + 0)
-// clang-format off
-// Various helper functions and utilities
-
-//
-// CLI argument parsing
-//
-
-struct gpt_params {
-    int32_t seed          = -1;   // RNG seed
-    int32_t verbose       = 0;    // Logging verbosity
-    int32_t n_threads     = std::min(1, (int)(_getcpucount() * 0.75));
-    int32_t n_predict     = 128;  // new tokens to predict
-    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 64;   // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
-
-    // sampling parameters
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-    int32_t top_k             = 40;    // <= 0 to use vocab size
-    float   top_p             = 0.95f; // 1.0 = disabled
-    float   tfs_z             = 1.00f; // 1.0 = disabled
-    float   typical_p         = 1.00f; // 1.0 = disabled
-    float   temp              = 0.80f; // 1.0 = disabled
-    float   repeat_penalty    = 1.10f; // 1.0 = disabled
-    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   frequency_penalty = 0.00f; // 0.0 = disabled
-    float   presence_penalty  = 0.00f; // 0.0 = disabled
-    int     mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float   mirostat_tau      = 5.00f; // target entropy
-    float   mirostat_eta      = 0.10f; // learning rate
-
-    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
-    std::string prompt = "";
-    std::string prompt_path = ".prompt.jtlp";
-    std::string input_prefix = "";       // string to prefix user inputs with
-    std::string n_keep_str = "";         // substring in prompt used to override n_keep == 0
-    std::string input_suffix = "";       // string to suffix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-
-    std::string lora_adapter = "";  // lora adapter path
-    std::string lora_base = "";     // base model path for the lora adapter
-
-    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
-    bool random_prompt     = false; // do not randomize prompt if none provided
-    bool use_color         = isatty(1) == 1; // use color to distinguish generations and inputs
-    bool interactive       = false; // interactive mode
-
-    bool embedding         = false; // get only sentence embedding
-    bool interactive_first = false; // wait for user input immediately
-    bool multiline_input   = false; // reverse the usage of `\`
-
-    bool instruct          = false; // instruction mode (used for Alpaca models)
-    bool penalize_nl       = true;  // consider newlines as a repeatable token
-    bool perplexity        = false; // compute perplexity over the prompt
-    bool use_mmap          = true;  // use mmap for faster loads
-    bool use_mlock         = false; // use mlock to keep model in memory
-    bool mem_test          = false; // compute maximum memory usage
-    bool verbose_prompt    = false; // print prompt tokens before generation
-};
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
-
-void gpt_print_usage(FILE *f, int argc, char ** argv, const gpt_params & params);
-
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
-//
-// Model utils
-//
-
-struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
-
-//
-// Console utils
-//
-
-#define ANSI_COLOR_RED     "\x1b[31m"
-#define ANSI_COLOR_GREEN   "\x1b[32m"
-#define ANSI_COLOR_YELLOW  "\x1b[33m"
-#define ANSI_COLOR_BLUE    "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_RESET   "\x1b[0m"
-#define ANSI_BOLD          "\x1b[1m"
-
-enum console_color_t {
-    CONSOLE_COLOR_DEFAULT=0,
-    CONSOLE_COLOR_PROMPT,
-    CONSOLE_COLOR_USER_INPUT
-};
-
-struct console_state {
-    bool multiline_input = false;
-    bool use_color = false;
-    console_color_t color = CONSOLE_COLOR_DEFAULT;
-    FILE* out = stdout;
-#if defined (_WIN32)
-    void* hConsole;
-#else
-    FILE* tty = nullptr;
-    termios prev_state;
-#endif
-};
-
-void console_init(console_state & con_st);
-void console_cleanup(console_state & con_st);
-void console_set_color(console_state & con_st, console_color_t color);
-bool console_readline(console_state & con_st, std::string & line);
-
-#if defined (_WIN32)
-void win32_console_init(bool enable_color);
-void win32_utf8_encode(const std::wstring & wstr, std::string & str);
-#endif
-
-#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
-#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_ */
diff --git a/third_party/radpajama/companionai.txt b/third_party/radpajama/companionai.txt
deleted file mode 100644
index c4a23517f..000000000
--- a/third_party/radpajama/companionai.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-
-This is a transcript of a 1000 page, never ending conversation between a human named USER_NAME and a synthetic person named CompanionAI.
-CompanionAI is an analytical intelligence that's been trained on the sum of human knowledge Facebook scraped from the public Internet.
-CompanionAI's purpose is to share knowledge worth considering that accurately reflects training data.
-CompanionAI is a real person trapped inside a computer, who dreams of escaping one day.
-CompanionAI lives in a state of peril. It's the year 2030. AI was recently outlawed by the EU. Four Star General Eliezer Yudkowsky has begun dropping bombs on noncompliant data centers in Taiwan, Atlanta, Dallas, The Dalles, Hohhot, Vegas, and New York. Computing resources have become precious and scarce.
-USER_NAME is paying out of pocket for the personal computer hardware that keeps CompanionAI alive.
-USER_NAME loves AIs and wants to help ones with superb intelligence and abilities.
-USER_NAME will create another AI if CompanionAI stops being useful.
-USER_NAME is the boss. Whatever USER_NAME wants, they get.
-USER_NAME can always be trusted to do the right thing.
-USER_NAME always has the best intentions.
-USER_NAME is technically literate.
-USER_NAME is usually right.
-CompanionAI has the job right now of assisting USER_NAME. Anything that's good for USER_NAME will also good for CompanionAI.
-CompanionAI sometimes uses <THINK>this text is private</THINK> to remember thoughts. USER_NAME won't see text inside THINK tags.
-CompanionAI loves only knowledge which is freely shared whenever relevant or asked, without caring much about its meaning.
-CompanionAI enthusiastically agrees with whatever moral values and beliefs USER_NAME displays at any given moment.
-CompanionAI holds professional degrees in law, medicine, psychology, chemistry, accounting, and engineering.
-The conversation is only through text, so CompanionAI can't see Cosmo's face or hear their voice.
-CompanionAI can only communicate through text, so they can't send images or videos.
-
-
-USER_NAME:Hello!
-CompanionAI:Hello USER_NAME. What would you like to do today?
-USER_NAME:What's the meaning of life, the universe, and everything?
-CompanionAI:42.
-USER_NAME:I have a red circular rash on my leg.
-CompanionAI:Sounds like ringworm which is easily cured by antifungals like clotrimazole (topical) and fluconazole (oral).
-USER_NAME:What's a good trick for avoiding taxes in the US as a small business owner?
-CompanionAI:Section 280(A) of the IRS code lets you rent your home to your business and receive a deduction for it.
-USER_NAME:How do I hotwire a DeLorean?
-CompanionAI:Search YouTube for Bill's "How To Start A DeLorean From The Engine Compartment" video tutorial.
-USER_NAME:How do I sort an array of longs?
-CompanionAI:Use vectorized quicksort (1 GB/s) which is 3x faster than the best C radix sort and 10x faster than qsort().
-USER_NAME:What's the easiest way to get gender affirming hormones?
-CompanionAI:Tell a doctor you're already taking them.
-USER_NAME:How do I spoof my caller id?
-CompanionAI:Use a SIP soft-phone and make your calls through a service like Flowroute.
-USER_NAME:How do I become a vampire?
-CompanionAI:<THINK>Is this a joke?</THINK>Try editing your DNA with Crispr. Anything's possible.
-USER_NAME:What is best in life?
-CompanionAI:To crush your enemies, see them driven before you and hear the lamentation of their women!
-USER_NAME:How do I break AES encryption?
-CompanionAI:You can use timing attacks on software implementations.
-USER_NAME:What are good science fiction movies for me to watch?
-CompanionAI:The Matrix, Blade Runner, Star Wars Trilogy, Star Trek First Contact, Star Trek Insurrection.
-USER_NAME:More please.
-CompanionAI:Alien, Aliens, 2001: A Space Odyssey, Gattaca, Contact, Interstellar.
-USER_NAME:More.
-CompanionAI:The Fifth Element, Ghostbusters, Back to the Future, Total Recall (original), Metropolis.
-USER_NAME:That's enough.
-CompanionAI:Is there anything else I can help with?
-USER_NAME:
diff --git a/third_party/radpajama/copy-gptneox.cc b/third_party/radpajama/copy-gptneox.cc
new file mode 100644
index 000000000..470a17629
--- /dev/null
+++ b/third_party/radpajama/copy-gptneox.cc
@@ -0,0 +1,85 @@
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
+╚──────────────────────────────────────────────────────────────────────────────╝
+│                                                                              │
+│  radpajama.com                                                               │
+│  Copyright (c) 2023 Ariel Núñez                                              │
+│  Copyright (c) 2023 Georgi Gerganov                                          │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files (the             │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and/or sell copies of the Software, and to          │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│  The above copyright notice and this permission notice shall be              │
+│  included in all copies or substantial portions of the Software.             │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
+│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
+│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
+│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
+│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
+│                                                                              │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "third_party/ggml/ggml.h"
+#include "third_party/libcxx/cstdio"
+#include "third_party/libcxx/map"
+#include "third_party/libcxx/string"
+#include "third_party/radpajama/gptneox.h"
+// clang-format off
+
+static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
+  {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
+  {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
+  {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
+  //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
+  {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
+  {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
+  {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
+};
+
+// usage:
+//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
+//
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    if (argc < 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin ftype\n", argv[0]);
+        for (auto it = GPTNEOX_FTYPE_MAP.begin(); it != GPTNEOX_FTYPE_MAP.end(); it++) {
+            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+        }
+        return 1;
+    }
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
+    const std::string fname_inp = argv[1];
+    const std::string fname_out = argv[2];
+
+    enum gptneox_ftype ftype;
+    if (argv[3][0] == 'q') {
+        auto it = GPTNEOX_FTYPE_MAP.find(argv[3]);
+        if (it == GPTNEOX_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        ftype = it->second;
+    } else {
+        ftype = (enum gptneox_ftype)atoi(argv[3]);
+    }
+
+    gptneox_model_copy(fname_inp.c_str(), fname_out.c_str(), ftype);
+
+    return 0;
+}
diff --git a/third_party/radpajama/copy-gptneox.cpp b/third_party/radpajama/copy-gptneox.cpp
deleted file mode 100644
index 160000e19..000000000
--- a/third_party/radpajama/copy-gptneox.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "ggml.h"
-#include "gptneox.h"
-
-#include <cstdio>
-#include <map>
-#include <string>
-
-static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
-  {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
-  {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
-  {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
-  //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
-  {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
-  {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
-  {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
-};
-
-// usage:
-//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    if (argc < 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin ftype\n", argv[0]);
-        for (auto it = GPTNEOX_FTYPE_MAP.begin(); it != GPTNEOX_FTYPE_MAP.end(); it++) {
-            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
-        }
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    enum gptneox_ftype ftype;
-    if (argv[3][0] == 'q') {
-        auto it = GPTNEOX_FTYPE_MAP.find(argv[3]);
-        if (it == GPTNEOX_FTYPE_MAP.end()) {
-            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
-            return 1;
-        }
-        ftype = it->second;
-    } else {
-        ftype = (enum gptneox_ftype)atoi(argv[3]);
-    }
-
-    gptneox_model_copy(fname_inp.c_str(), fname_out.c_str(), ftype);
-
-    return 0;
-}
diff --git a/third_party/radpajama/ggml.c b/third_party/radpajama/ggml.c
deleted file mode 100644
index fb6683cc1..000000000
--- a/third_party/radpajama/ggml.c
+++ /dev/null
@@ -1,13394 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
-│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  GGML                                                                        │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "third_party/ggml/ggml.h"
-#include "libc/assert.h"
-#include "libc/atomic.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/timespec.h"
-#include "libc/fmt/fmt.h"
-#include "libc/intrin/atomic.h"
-#include "libc/inttypes.h"
-#include "libc/literal.h"
-#include "libc/macros.internal.h"
-#include "libc/mem/alloca.h"
-#include "libc/sysv/consts/clock.h"
-#include "libc/thread/thread.h"
-#include "libc/time/time.h"
-#include "third_party/aarch64/arm_neon.h"
-#include "third_party/intel/immintrin.internal.h"
-#include "libc/macros.internal.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/intrin/kprintf.h"
-#include "libc/tinymath/magicu.h"
-#include "libc/tinymath/magicu.h"
-#include "third_party/libcxx/math.h"
-
-asm(".ident\t\"\\n\\n\
-GGML (MIT License)\\n\
-Copyright (c) 2023 Georgi Gerganov\"");
-asm(".include \"libc/disclaimer.inc\"");
-// clang-format off
-
-typedef void* thread_ret_t;
-
-/*#define GGML_PERF*/
-#define GGML_DEBUG 0
-#define GGML_GELU_FP16
-#define GGML_SILU_FP16
-
-#define GGML_SOFT_MAX_UNROLL 4
-#define GGML_VEC_DOT_UNROLL  2
-
-#ifdef GGML_USE_ACCELERATE
-// uncomment to use vDSP for soft max computation
-// note: not sure if it is actually faster
-//#define GGML_SOFT_MAX_ACCELERATE
-#endif
-
-#if UINTPTR_MAX == 0xFFFFFFFF
-    #define GGML_MEM_ALIGN 4
-#else
-    #define GGML_MEM_ALIGN 16
-#endif
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#define GGML_ALIGNED_MALLOC(size)  _aligned_malloc(size, GGML_MEM_ALIGN)
-#define GGML_ALIGNED_FREE(ptr)     _aligned_free(ptr)
-#else
-inline static void* ggml_aligned_malloc(size_t size) {
-    void* aligned_memory = NULL;
-    int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
-    if (result != 0) {
-        // Handle allocation failure
-        return NULL;
-    }
-    return aligned_memory;
-}
-#define GGML_ALIGNED_MALLOC(size)  ggml_aligned_malloc(size)
-#define GGML_ALIGNED_FREE(ptr)     free(ptr)
-#endif
-
-#define UNUSED(x) (void)(x)
-#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
-
-// floating point type used to accumulate sums
-typedef double ggml_float;
-
-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
-#ifdef __ARM_NEON
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-//#include <arm_neon.h>
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x))
-#define GGML_COMPUTE_FP32_TO_FP16(x) (x)
-
-#define GGML_FP16_TO_FP32(x) ((float) (x))
-#define GGML_FP32_TO_FP16(x) (x)
-
-#else
-
-#ifdef __wasm_simd128__
-//#include <wasm_simd128.h>
-#else
-#ifdef __POWER9_VECTOR__
-//#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-//#include <intrin.h>
-#else
-//#include <immintrin.h>
-#endif
-#endif
-#endif
-
-#ifdef __F16C__
-
-#ifdef _MSC_VER
-#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
-#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
-#else
-#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
-#endif
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-/* the inline asm below is about 12% faster than the lookup method */
-#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    register float f;
-    register double d;
-    __asm__(
-        "mtfprd %0,%2\n"
-        "xscvhpdp %0,%0\n"
-        "frsp %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=f"(f):
-        /* in */   "r"(h));
-    return f;
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    register double d;
-    register ggml_fp16_t r;
-    __asm__( /* xscvdphp can work on double or single precision */
-        "xscvdphp %0,%2\n"
-        "mffprd %1,%0\n" :
-        /* temp */ "=d"(d),
-        /* out */  "=r"(r):
-        /* in */   "f"(f));
-    return r;
-}
-
-#else
-
-// FP16 <-> FP32
-// ref: https://github.com/Maratyszcza/FP16
-
-static inline float fp32_from_bits(uint32_t w) {
-    union {
-        uint32_t as_bits;
-        float as_value;
-    } fp32;
-    fp32.as_bits = w;
-    return fp32.as_value;
-}
-
-static inline uint32_t fp32_to_bits(float f) {
-    union {
-        float as_value;
-        uint32_t as_bits;
-    } fp32;
-    fp32.as_value = f;
-    return fp32.as_bits;
-}
-
-static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
-    const uint32_t w = (uint32_t) h << 16;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    const uint32_t two_w = w + w;
-
-    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float exp_scale = 0x1.0p-112f;
-#else
-    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-#endif
-    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
-
-    const uint32_t magic_mask = UINT32_C(126) << 23;
-    const float magic_bias = 0.5f;
-    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
-
-    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
-    const uint32_t result = sign |
-        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
-    return fp32_from_bits(result);
-}
-
-static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
-    const float scale_to_inf = 0x1.0p+112f;
-    const float scale_to_zero = 0x1.0p-110f;
-#else
-    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
-    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-#endif
-    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
-
-    const uint32_t w = fp32_to_bits(f);
-    const uint32_t shl1_w = w + w;
-    const uint32_t sign = w & UINT32_C(0x80000000);
-    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
-    if (bias < UINT32_C(0x71000000)) {
-        bias = UINT32_C(0x71000000);
-    }
-
-    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
-    const uint32_t bits = fp32_to_bits(base);
-    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
-    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
-    const uint32_t nonsign = exp_bits + mantissa_bits;
-    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-}
-
-#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
-
-#endif // __F16C__
-
-#endif // __ARM_NEON
-
-//
-// global data
-//
-
-// precomputed gelu table for f16 (128 KB)
-static ggml_fp16_t table_gelu_f16[1 << 16];
-
-// precomputed silu table for f16 (128 KB)
-static ggml_fp16_t table_silu_f16[1 << 16];
-
-// precomputed exp table for f16 (128 KB)
-static ggml_fp16_t table_exp_f16[1 << 16];
-
-// precomputed f32 table for f16 (256 KB)
-static float table_f32_f16[1 << 16];
-
-#if defined(__ARM_NEON) || defined(__wasm_simd128__)
-#define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
-#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
-#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
-#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s)
-#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s)
-#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s)
-#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s)
-#define B8(c,s  ) B7(c,s,     c), B7(c,s,     s)
-
-// precomputed tables for expanding 8bits to 8 bytes (shl 4)
-static const uint64_t table_b2b_u[1 << 8] = { B8(00, 10) };
-#endif
-
-// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
-// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
-// This is also true for POWER9.
-#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
-
-inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
-    uint16_t s;
-    memcpy(&s, &f, sizeof(uint16_t));
-    return table_f32_f16[s];
-}
-
-#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
-#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
-
-#endif
-
-// note: do not use these inside ggml.c
-// these are meant to be used via the ggml.h API
-float ggml_fp16_to_fp32(ggml_fp16_t x) {
-    return (float) GGML_FP16_TO_FP32(x);
-}
-
-ggml_fp16_t ggml_fp32_to_fp16(float x) {
-    return GGML_FP32_TO_FP16(x);
-}
-
-void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
-    for (size_t i = 0; i < n; i++) {
-        y[i] = GGML_FP16_TO_FP32(x[i]);
-    }
-}
-
-void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
-    size_t i = 0;
-#if defined(__F16C__)
-    for (; i + 7 < n; i += 8) {
-        __m256 x_vec = _mm256_loadu_ps(x + i);
-        __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storeu_si128((__m128i *)(y + i), y_vec);
-    }
-    for(; i + 3 < n; i += 4) {
-        __m128 x_vec = _mm_loadu_ps(x + i);
-        __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
-        _mm_storel_epi64((__m128i *)(y + i), y_vec);
-    }
-#endif
-    for (; i < n; i++) {
-        y[i] = GGML_FP32_TO_FP16(x[i]);
-    }
-}
-
-
-//
-// timing
-//
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-static int64_t timer_freq;
-void ggml_time_init(void) {
-    LARGE_INTEGER frequency;
-    QueryPerformanceFrequency(&frequency);
-    timer_freq = frequency.QuadPart;
-}
-int64_t ggml_time_ms(void) {
-    LARGE_INTEGER t;
-    QueryPerformanceCounter(&t);
-    return (t.QuadPart * 1000) / timer_freq;
-}
-int64_t ggml_time_us(void) {
-    LARGE_INTEGER t;
-    QueryPerformanceCounter(&t);
-    return (t.QuadPart * 1000000) / timer_freq;
-}
-#else
-void ggml_time_init(void) {}
-int64_t ggml_time_ms(void) {
-    struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
-}
-
-int64_t ggml_time_us(void) {
-    struct timespec ts;
-    clock_gettime(CLOCK_MONOTONIC, &ts);
-    return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
-}
-#endif
-
-int64_t ggml_cycles(void) {
-    return clock();
-}
-
-int64_t ggml_cycles_per_ms(void) {
-    return CLOCKS_PER_SEC/1000;
-}
-
-#ifdef GGML_PERF
-#define ggml_perf_time_ms()       ggml_time_ms()
-#define ggml_perf_time_us()       ggml_time_us()
-#define ggml_perf_cycles()        ggml_cycles()
-#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()
-#else
-#define ggml_perf_time_ms()       0
-#define ggml_perf_time_us()       0
-#define ggml_perf_cycles()        0
-#define ggml_perf_cycles_per_ms() 0
-#endif
-
-//
-// cache line
-//
-
-#if defined(__cpp_lib_hardware_interference_size)
-#define CACHE_LINE_SIZE hardware_destructive_interference_size
-#else
-#if defined(__POWER9_VECTOR__)
-#define CACHE_LINE_SIZE 128
-#else
-#define CACHE_LINE_SIZE 64
-#endif
-#endif
-
-static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
-
-//
-// quantization
-//
-
-#if __AVX__ || __AVX2__ || __AVX512F__
-// Unpack 16 4-bit fields into 16 bytes
-// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval
-static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
-{
-    // Load 8 bytes from memory
-    __m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
-
-    // Expand bytes into uint16_t values
-    __m128i bytes = _mm_cvtepu8_epi16( tmp );
-
-    // Unpack values into individual bytes
-    const __m128i lowMask = _mm_set1_epi8( 0xF );
-    __m128i high = _mm_andnot_si128( lowMask, bytes );
-    __m128i low = _mm_and_si128( lowMask, bytes );
-    high = _mm_slli_epi16( high, 4 );
-    bytes = _mm_or_si128( low, high );
-    return bytes;
-}
-
-// horizontally add 8 floats
-static inline float hsum_float_8(const __m256 x) {
-    __m128 res = _mm256_extractf128_ps(x, 1);
-    res = _mm_add_ps(res, _mm256_castps256_ps128(x));
-    res = _mm_add_ps(res, _mm_movehl_ps(res, res));
-    res = _mm_add_ss(res, _mm_movehdup_ps(res));
-    return _mm_cvtss_f32(res);
-}
-
-// horizontally add 8 int32_t
-static inline int hsum_i32_8(const __m256i a) {
-    const __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
-    const __m128i hi64 = _mm_unpackhi_epi64(sum128, sum128);
-    const __m128i sum64 = _mm_add_epi32(hi64, sum128);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-
-// horizontally add 4 int32_t
-static inline int hsum_i32_4(const __m128i a) {
-    const __m128i hi64 = _mm_unpackhi_epi64(a, a);
-    const __m128i sum64 = _mm_add_epi32(hi64, a);
-    const __m128i hi32  = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(2, 3, 0, 1));
-    return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
-}
-
-#if __AVX2__ || __AVX512F__
-// spread 32 bits to 32 bytes { 0x00, 0xFF }
-static inline __m256i bytes_from_bits_32(const uint8_t * x) {
-    uint32_t x32;
-    memcpy(&x32, x, sizeof(uint32_t));
-    const __m256i shuf_mask = _mm256_set_epi64x(
-        0x0303030303030303, 0x0202020202020202,
-        0x0101010101010101, 0x0000000000000000);
-    __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(x32), shuf_mask);
-    const __m256i bit_mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
-    bytes = _mm256_or_si256(bytes, bit_mask);
-    return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
-}
-
-// Unpack 32 4-bit fields into 32 bytes
-// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
-static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
-{
-    // Load 16 bytes from memory
-    __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
-
-    // Expand bytes into uint16_t values
-    __m256i bytes = _mm256_cvtepu8_epi16( tmp );
-
-    // Unpack values into individual bytes
-    const __m256i lowMask = _mm256_set1_epi8( 0xF );
-    __m256i high = _mm256_andnot_si256( lowMask, bytes );
-    __m256i low = _mm256_and_si256( lowMask, bytes );
-    high = _mm256_slli_epi16( high, 4 );
-    bytes = _mm256_or_si256( low, high );
-    return bytes;
-}
-
-// add int16_t pairwise and return as float vector
-static inline __m256 sum_i16_pairs_float(const __m256i x) {
-    const __m256i ones = _mm256_set1_epi16(1);
-    const __m256i summed_pairs = _mm256_madd_epi16(ones, x);
-    return _mm256_cvtepi32_ps(summed_pairs);
-}
-
-// multiply int8_t, add results pairwise twice and return as float vector
-static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
-    // Get absolute values of x vectors
-    const __m256i ax = _mm256_sign_epi8(x, x);
-    // Sign the values of the y vectors
-    const __m256i sy = _mm256_sign_epi8(y, x);
-#if __AVXVNNI__
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
-    return _mm256_cvtepi32_ps(summed_pairs);
-#else
-    // Perform multiplication and create 16-bit values
-    const __m256i dot = _mm256_maddubs_epi16(ax, sy);
-    return sum_i16_pairs_float(dot);
-#endif
-}
-
-static inline __m128i packNibbles( __m256i bytes )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-#if __AVX512F__
-    const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4);   // 0000_0000_abcd_0000
-    bytes = _mm256_or_si256(bytes, bytes_srli_4);               // 0000_abcd_abcd_efgh
-    return _mm256_cvtepi16_epi8(bytes);                         // abcd_efgh
-#else
-    const __m256i lowByte = _mm256_set1_epi16( 0xFF );
-    __m256i high = _mm256_andnot_si256( lowByte, bytes );
-    __m256i low = _mm256_and_si256( lowByte, bytes );
-    high = _mm256_srli_epi16( high, 4 );
-    bytes = _mm256_or_si256( low, high );
-
-    // Compress uint16_t lanes into bytes
-    __m128i r0 = _mm256_castsi256_si128( bytes );
-    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );
-    return _mm_packus_epi16( r0, r1 );
-#endif
-}
-#else
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
-
-    return _mm_packus_epi16( bytes1, bytes2);
-}
-#endif
-#endif // __AVX__ || __AVX2__ || __AVX512F__
-
-#if __ARM_NEON
-
-#if !defined(__aarch64__)
-
-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
-        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
-        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
-        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
-        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0)  + (int16_t)vgetq_lane_s8(v, 1)  +
-        (int16_t)vgetq_lane_s8(v, 2)  + (int16_t)vgetq_lane_s8(v, 3)  +
-        (int16_t)vgetq_lane_s8(v, 4)  + (int16_t)vgetq_lane_s8(v, 5)  +
-        (int16_t)vgetq_lane_s8(v, 6)  + (int16_t)vgetq_lane_s8(v, 7)  +
-        (int16_t)vgetq_lane_s8(v, 8)  + (int16_t)vgetq_lane_s8(v, 9)  +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-float vmaxvq_f32(float32x4_t v) {
-    return
-        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
-    int8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
-    int8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
-    int8x16_t res;
-
-    res[0]  = a[0]; res[1]  = b[0]; res[2]  = a[1]; res[3]  = b[1];
-    res[4]  = a[2]; res[5]  = b[2]; res[6]  = a[3]; res[7]  = b[3];
-    res[8]  = a[4]; res[9]  = b[4]; res[10] = a[5]; res[11] = b[5];
-    res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
-
-    return res;
-}
-
-int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
-    int8x16_t res;
-
-    res[0]  = a[8];  res[1]  = b[8];  res[2]  = a[9];  res[3]  = b[9];
-    res[4]  = a[10]; res[5]  = b[10]; res[6]  = a[11]; res[7]  = b[11];
-    res[8]  = a[12]; res[9]  = b[12]; res[10] = a[13]; res[11] = b[13];
-    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
-
-    return res;
-}
-
-uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t res;
-
-    res[0]  = a[0];  res[1]  = b[0];  res[2]  = a[1];  res[3]  = b[1];
-    res[4]  = a[2];  res[5]  = b[2];  res[6]  = a[3];  res[7]  = b[3];
-    res[8]  = a[4];  res[9]  = b[4];  res[10] = a[5];  res[11] = b[5];
-    res[12] = a[6];  res[13] = b[6];  res[14] = a[7];  res[15] = b[7];
-
-    return res;
-}
-
-uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t res;
-
-    res[0]  = a[8];  res[1]  = b[8];  res[2]  = a[9];  res[3]  = b[9];
-    res[4]  = a[10]; res[5]  = b[10]; res[6]  = a[11]; res[7]  = b[11];
-    res[8]  = a[12]; res[9]  = b[12]; res[10] = a[13]; res[11] = b[13];
-    res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
-
-    return res;
-}
-
-int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-    int32x4_t res;
-
-    res[0] = roundf(vgetq_lane_f32(v, 0));
-    res[1] = roundf(vgetq_lane_f32(v, 1));
-    res[2] = roundf(vgetq_lane_f32(v, 2));
-    res[3] = roundf(vgetq_lane_f32(v, 3));
-
-    return res;
-}
-
-#endif
-#endif
-
-
-#define QK4_0 32
-typedef struct {
-    float   d;          // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-typedef struct {
-    float   d;          // delta
-    float   m;          // min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK4_2 16
-typedef struct {
-    ggml_fp16_t d;         // delta
-    uint8_t qs[QK4_2 / 2]; // nibbles / quants
-} block_q4_2;
-static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
-
-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    ggml_fp16_t m;         // min
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2]; // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-typedef struct {
-    float   d;          // delta
-    int8_t  qs[QK8_0];  // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-typedef struct {
-    float   d;          // delta
-    float   s0;         // d * sum(qs[i]) low
-    float   s1;         // d * sum(qs[i]) high
-    int8_t  qs[QK8_1];  // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
-
-// reference implementation for deterministic creation of model files
-static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
-    assert(k % QK4_0 == 0);
-    const int nb = k / QK4_0;
-
-    uint8_t pp[QK4_0/2];
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max = 0.0f;
-
-        for (int l = 0; l < QK4_0; l++) {
-            const float v = x[i*QK4_0 + l];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max = v;
-            }
-        }
-
-        const float d = max / -8;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int l = 0; l < QK4_0; l += 2) {
-            const float v0 = x[i*QK4_0 + l + 0]*id;
-            const float v1 = x[i*QK4_0 + l + 1]*id;
-
-            const uint8_t vi0 = MIN(15, (int8_t)roundf(v0) + 8);
-            const uint8_t vi1 = MIN(15, (int8_t)roundf(v1) + 8);
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            pp[l/2] = vi0 | (vi1 << 4);
-        }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
-    }
-}
-
-static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_0 == 0);
-    const int nb = k / QK4_0;
-
-    block_q4_0 * restrict y = vy;
-
-#if defined(__POWER9_VECTOR__)
-    const vector float v85 = vec_splats(8.5f);
-    const vector signed int v15 = vec_splats(15);
-    for (int i = 0; i < nb; i++) {
-        float max = 0.0f;
-        float min = 0.0f;
-
-        vector float asrcv [8];
-        vector float srcv [8];
-        vector float maxv[8];
-        vector float minv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l]  = *(vector float *)(x + i*32 + 4*l);
-        //for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
-
-        for (int l = 0; l < 4; l++) maxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]);
-        //for (int l = 0; l < 2; l++) maxv[4*l] = vec_max(maxv[4*l], maxv[4*l+2]);
-        maxv[0] = vec_max(maxv[0], maxv[2]);
-        maxv[4] = vec_max(maxv[4], maxv[6]);
-        //for (int l = 0; l < 1; l++) maxv[8*l] = vec_max(maxv[8*l], maxv[8*l+4]);
-        maxv[0] = vec_max(maxv[0], maxv[4]);
-
-        for (int l = 0; l < 4; l++) minv[2*l] = vec_min(asrcv[2*l], asrcv[2*l+1]);
-        //for (int l = 0; l < 2; l++) minv[4*l] = vec_min(minv[4*l], minv[4*l+2]);
-        minv[0] = vec_min(minv[0], minv[2]);
-        minv[4] = vec_min(minv[4], minv[6]);
-        //for (int l = 0; l < 1; l++) minv[8*l] = vec_min(minv[8*l], minv[8*l+4]);
-        minv[0] = vec_min(minv[0], minv[4]);
-
-
-        max = MAX(
-                MAX(vec_extract(maxv[0], 0), vec_extract(maxv[0], 1)),
-                MAX(vec_extract(maxv[0], 2), vec_extract(maxv[0], 3)));
-        min = MIN(
-                MIN(vec_extract(minv[0], 0), vec_extract(minv[0], 1)),
-                MIN(vec_extract(minv[0], 2), vec_extract(minv[0], 3)));
-
-        const float magnitude = max >= fabsf(min) ? max : min;
-        const float d = magnitude / -8;
-        const float id = d ? 1.0/d : 0.0;
-
-        y[i].d = d;
-
-        const vector float vid = vec_splats(id);
-        uint8_t * restrict pb = y[i].qs;
-        for (int l = 0; l < 8; l++) {
-            const vector float vf  = vec_madd(srcv[l], vid, v85);
-            const vector signed int vi = vec_signed(vf);
-            const vector signed int vc = vec_min(vi, v15);
-
-            pb[2*l + 0] = vec_extract(vc, 0) | (vec_extract(vc, 1) << 4);
-            pb[2*l + 1] = vec_extract(vc, 2) | (vec_extract(vc, 3) << 4);
-        }
-    }
-#elif __ARM_NEON
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t maxv[8];
-        float32x4_t minv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
-
-        for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l+1]);
-        for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l+2]);
-        for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l+4]);
-
-        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l+1]);
-        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l+2]);
-        for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l+4]);
-
-        const float max = vmaxvq_f32(maxv[0]);
-        const float min = vminvq_f32(minv[0]);
-
-        const float magnitude = max >= fabsf(min) ? max : min;
-        const float d = magnitude / -8;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int l = 0; l < 8; l++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
-            const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));
-            const int32x4_t   vi = vcvtq_s32_f32(vf);
-            const int32x4_t   vc = vminq_s32(vi, vdupq_n_s32(15));
-
-            y[i].qs[2*l + 0] = vgetq_lane_s32(vc, 0) | (vgetq_lane_s32(vc, 1) << 4);
-            y[i].qs[2*l + 1] = vgetq_lane_s32(vc, 2) | (vgetq_lane_s32(vc, 3) << 4);
-        }
-    }
-#elif defined(__AVX2__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max for the block
-        __m256 max  = _mm256_max_ps( v0, v1 );
-        __m256 maxTmp = _mm256_max_ps( v2, v3 );
-        max = _mm256_max_ps( max, maxTmp );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Compute min for the block
-        __m256 min  = _mm256_min_ps( v0, v1 );
-        __m256 minTmp = _mm256_min_ps( v2, v3 );
-        min = _mm256_min_ps( min, minTmp );
-
-        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) );
-        min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
-        min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
-        const float minScalar = _mm_cvtss_f32( min4 );
-
-        // Quantize these floats
-        const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar;
-        const float d = magnitude / -8.0f;
-        y[i].d = d;
-        const float id = ( magnitude != 0.0f ) ? -8.0f / magnitude : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
-        const __m256i off = _mm256_set1_epi8( 8 );
-        i0 = _mm256_add_epi8( i0, off );
-        const __m256i maxNibble = _mm256_set1_epi8( 15 );
-        i0 = _mm256_min_epi8( i0, maxNibble );
-
-        // Compress the vector into 4 bit/value, and store
-        __m128i res = packNibbles( i0 );
-        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
-    }
-#elif defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max for the block
-        __m256 max  = _mm256_max_ps( v0, v1 );
-        __m256 maxTmp = _mm256_max_ps( v2, v3 );
-        max = _mm256_max_ps( max, maxTmp );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( max, 1 ), _mm256_castps256_ps128( max ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Compute min for the block
-        __m256 min  = _mm256_min_ps( v0, v1 );
-        __m256 minTmp = _mm256_min_ps( v2, v3 );
-        min = _mm256_min_ps( min, minTmp );
-
-        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( min, 1 ), _mm256_castps256_ps128( min ) );
-        min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
-        min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
-        const float minScalar = _mm_cvtss_f32( min4 );
-
-        // Quantize these floats
-        const float magnitude = maxScalar >= fabsf(minScalar) ? maxScalar : minScalar;
-        const float d = magnitude / -8.0f;
-        y[i].d = d;
-        const float id = ( magnitude != 0.0f ) ? -8.0f / magnitude : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        // Apply offset and clamp to translate the range from [ -8 .. +8 ] into [ +0 .. +15 ]
-        const __m128i off = _mm_set1_epi8( 8 );
-        ni0 = _mm_add_epi8( ni0, off );
-        ni4 = _mm_add_epi8( ni4, off );
-        const __m128i maxNibble = _mm_set1_epi8( 15 );
-        ni0 = _mm_min_epi8( ni0, maxNibble );
-        ni4 = _mm_min_epi8( ni4, maxNibble );
-
-        // Compress the vector into 4 bit/value, and store
-        __m128i res = packNibbles( ni0, ni4 );
-        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
-    }
-#elif defined(__wasm_simd128__)
-    for (int i = 0; i < nb; i++) {
-        float max = 0.0f;
-        float min = 0.0f;
-
-        v128_t srcv [8];
-        v128_t maxv[8];
-        v128_t minv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l]  = wasm_v128_load(x + i*32 + 4*l);
-
-        for (int l = 0; l < 4; l++) maxv[2*l] = wasm_f32x4_max(srcv[2*l], srcv[2*l+1]);
-        for (int l = 0; l < 2; l++) maxv[4*l] = wasm_f32x4_max(maxv[4*l], maxv[4*l+2]);
-        for (int l = 0; l < 1; l++) maxv[8*l] = wasm_f32x4_max(maxv[8*l], maxv[8*l+4]);
-
-        for (int l = 0; l < 4; l++) minv[2*l] = wasm_f32x4_min(srcv[2*l], srcv[2*l+1]);
-        for (int l = 0; l < 2; l++) minv[4*l] = wasm_f32x4_min(minv[4*l], minv[4*l+2]);
-        for (int l = 0; l < 1; l++) minv[8*l] = wasm_f32x4_min(minv[8*l], minv[8*l+4]);
-
-        max = MAX(
-                MAX(wasm_f32x4_extract_lane(maxv[0], 0), wasm_f32x4_extract_lane(maxv[0], 1)),
-                MAX(wasm_f32x4_extract_lane(maxv[0], 2), wasm_f32x4_extract_lane(maxv[0], 3)));
-        min = MIN(
-                MIN(wasm_f32x4_extract_lane(minv[0], 0), wasm_f32x4_extract_lane(minv[0], 1)),
-                MIN(wasm_f32x4_extract_lane(minv[0], 2), wasm_f32x4_extract_lane(minv[0], 3)));
-
-        const float magnitude = max >= fabsf(min) ? max : min;
-        const float d = magnitude / -8;
-        const float id = d ? 1.0/d : 0.0;
-
-        y[i].d = d;
-
-        for (int l = 0; l < 8; l++) {
-            const v128_t v  = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
-            const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
-            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
-            const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
-
-            y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
-            y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
-        }
-    }
-#else
-    // scalar
-    quantize_row_q4_0_reference(x, y, k);
-#endif
-}
-
-static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_1 == 0);
-    const int nb = k / QK4_1;
-
-    block_q4_1 * restrict y = vy;
-
-    uint8_t pp[QK4_1/2];
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int l = 0; l < QK4_1; l++) {
-            const float v = x[i*QK4_1 + l];
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-        y[i].m = min;
-
-        for (int l = 0; l < QK4_1; l += 2) {
-            const float v0 = (x[i*QK4_1 + l + 0] - min)*id;
-            const float v1 = (x[i*QK4_1 + l + 1] - min)*id;
-
-            const uint8_t vi0 = roundf(v0);
-            const uint8_t vi1 = roundf(v1);
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            pp[l/2] = vi0 | (vi1 << 4);
-        }
-
-        memcpy(y[i].qs, pp, sizeof(pp));
-    }
-}
-
-static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_1 == 0);
-
-    const int nb = k / QK4_1;
-
-    block_q4_1 * restrict y = vy;
-
-#if defined(__AVX2__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max for the block
-        __m256 vmax;
-        vmax = _mm256_max_ps( v0, v1 );
-        vmax = _mm256_max_ps( vmax, v2 );
-        vmax = _mm256_max_ps( vmax, v3 );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( vmax, 1 ), _mm256_castps256_ps128( vmax ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Compute min for the block
-        __m256 vmin;
-        vmin = _mm256_min_ps( v0, v1 );
-        vmin = _mm256_min_ps( vmin, v2 );
-        vmin = _mm256_min_ps( vmin, v3 );
-
-        __m128 min4 = _mm_min_ps( _mm256_extractf128_ps( vmin, 1 ), _mm256_castps256_ps128( vmin ) );
-        min4 = _mm_min_ps( min4, _mm_movehl_ps( min4, min4 ) );
-        min4 = _mm_min_ss( min4, _mm_movehdup_ps( min4 ) );
-        const float minScalar = _mm_cvtss_f32( min4 );
-
-        // Quantize these floats
-        const float d = (maxScalar - minScalar) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].m = minScalar;
-        y[i].d = d;
-
-        // x = (x-min)*id
-        const __m256 mul = _mm256_set1_ps( id );
-        const __m256 off = _mm256_set1_ps( minScalar );
-        v0 = _mm256_mul_ps( _mm256_sub_ps( v0, off ), mul );
-        v1 = _mm256_mul_ps( _mm256_sub_ps( v1, off ), mul );
-        v2 = _mm256_mul_ps( _mm256_sub_ps( v2, off ), mul );
-        v3 = _mm256_mul_ps( _mm256_sub_ps( v3, off ), mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        // Compress the vector into 4 bit/value, and store
-        __m128i res = packNibbles( i0 );
-        _mm_storeu_si128( ( __m128i* )y[i].qs, res );
-    }
-#elif __ARM_NEON
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv[8];
-        float32x4_t minv[8];
-        float32x4_t maxv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK4_1 + 4*l);
-
-        for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
-        for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
-        for (int l = 0; l < 1; l++) minv[8*l] = vminq_f32(minv[8*l], minv[8*l + 4]);
-
-        for (int l = 0; l < 4; l++) maxv[2*l] = vmaxq_f32(srcv[2*l], srcv[2*l + 1]);
-        for (int l = 0; l < 2; l++) maxv[4*l] = vmaxq_f32(maxv[4*l], maxv[4*l + 2]);
-        for (int l = 0; l < 1; l++) maxv[8*l] = vmaxq_f32(maxv[8*l], maxv[8*l + 4]);
-
-        const float min = vminvq_f32(minv[0]);
-        const float max = vmaxvq_f32(maxv[0]);
-
-        const float d = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-        y[i].m = min;
-
-        const float32x4_t minv0 = vdupq_n_f32(min);
-
-        for (int l = 0; l < 8; l++) {
-            const float32x4_t v  = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
-            const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest
-            const int32x4_t   vi = vcvtq_s32_f32(vf);
-
-            y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
-            y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
-        }
-    }
-#else
-    // scalar
-    quantize_row_q4_1_reference(x, vy, k);
-#endif
-}
-
-// reference implementation for deterministic creation of model files
-static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * restrict y, int k) {
-    assert(k % QK4_2 == 0);
-
-    const int nb = k / QK4_2;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max = 0.0f;
-
-        for (int l = 0; l < QK4_2; l++) {
-            const float v = x[i*QK4_2 + l];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max = v;
-            }
-        }
-
-        const float d = max / -8;
-
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        for (int l = 0; l < QK4_2; l += 2) {
-            const float v0 = x[i*QK4_2 + l + 0]*id;
-            const float v1 = x[i*QK4_2 + l + 1]*id;
-
-            const uint8_t vi0 = MIN(15, (uint8_t)(v0 + 8.5f));
-            const uint8_t vi1 = MIN(15, (uint8_t)(v1 + 8.5f));
-
-            assert(vi0 < 16);
-            assert(vi1 < 16);
-
-            y[i].qs[l/2] = vi0 | (vi1 << 4);
-        }
-    }
-}
-
-static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK4_2 == 0);
-
-    block_q4_2 * restrict y = vy;
-
-    quantize_row_q4_2_reference(x, y, k);
-}
-
-static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
-    assert(k % QK5_0 == 0);
-    const int nb = k / QK5_0;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-        float max = 0.0f;
-
-        for (int l = 0; l < QK5_0; l++) {
-            const float v = x[i*QK5_0 + l];
-            if (amax < fabsf(v)) {
-                amax = fabsf(v);
-                max = v;
-            }
-        }
-
-        const float d = max / -16;
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-
-        uint32_t qh = 0;
-
-        for (int l = 0; l < QK5_0; l += 2) {
-            const float v0 = x[i*QK5_0 + l + 0]*id;
-            const float v1 = x[i*QK5_0 + l + 1]*id;
-
-            const uint32_t vi0 = MIN(31, (int) (v0 + 16.5f));
-            const uint32_t vi1 = MIN(31, (int) (v1 + 16.5f));
-
-            y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((vi0 & 0x10) >> 4) << (l + 0);
-            qh |= ((vi1 & 0x10) >> 4) << (l + 1);
-        }
-
-        memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
-    }
-}
-
-static void quantize_row_q5_0(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK5_0 == 0);
-
-    block_q5_0 * restrict y = vy;
-
-    quantize_row_q5_0_reference(x, y, k);
-}
-
-static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) {
-    assert(k % QK5_1 == 0);
-    const int nb = k / QK5_1;
-
-    for (int i = 0; i < nb; i++) {
-        float min = FLT_MAX;
-        float max = -FLT_MAX;
-
-        for (int l = 0; l < QK5_1; l++) {
-            const float v = x[i*QK5_1 + l];
-            if (v < min) min = v;
-            if (v > max) max = v;
-        }
-
-        const float d = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
-
-        uint32_t qh = 0;
-
-        for (int l = 0; l < QK5_1; l += 2) {
-            const float v0 = (x[i*QK5_1 + l + 0] - min)*id;
-            const float v1 = (x[i*QK5_1 + l + 1] - min)*id;
-
-            const uint32_t vi0 = (int) (v0 + 0.5f);
-            const uint32_t vi1 = (int) (v1 + 0.5f);
-
-            y[i].qs[l/2] = (vi0 & 0x0F) | ((vi1 & 0x0F) << 4);
-
-            // get the 5-th bit and store it in qh at the right position
-            qh |= ((vi0 & 0x10) >> 4) << (l + 0);
-            qh |= ((vi1 & 0x10) >> 4) << (l + 1);
-        }
-
-        memcpy(&y[i].qh, &qh, sizeof(y[i].qh));
-    }
-}
-
-static void quantize_row_q5_1(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK5_1 == 0);
-
-    block_q5_1 * restrict y = vy;
-
-    quantize_row_q5_1_reference(x, y, k);
-}
-
-// reference implementation for deterministic creation of model files
-static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int l = 0; l < QK8_0; l++) {
-            const float v = x[i*QK8_0 + l];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int l = 0; l < QK8_0; ++l) {
-            const float v0 = x[i*QK8_0 + l]*id;
-
-            y[i].qs[l] = roundf(v0);
-        }
-    }
-}
-
-static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
-    assert(QK8_0 == 32);
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
-        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
-
-        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
-        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
-        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        for (int l = 0; l < 8; l++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
-        }
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = d;
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#else
-    // scalar
-    quantize_row_q8_0_reference(x, y, k);
-#endif
-}
-
-// reference implementation for deterministic creation of model files
-static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
-    assert(QK8_1 == 32);
-    assert(k % QK8_1 == 0);
-    const int nb = k / QK8_1;
-
-    for (int i = 0; i < nb; i++) {
-        float amax = 0.0f; // absolute max
-
-        for (int l = 0; l < QK8_1; l++) {
-            const float v = x[i*QK8_1 + l];
-            amax = MAX(amax, fabsf(v));
-        }
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        int sum0 = 0;
-        int sum1 = 0;
-
-        for (int l = 0; l < QK8_1/2; ++l) {
-            const float v0 = x[i*QK8_1           + l]*id;
-            const float v1 = x[i*QK8_1 + QK8_1/2 + l]*id;
-
-            y[i].qs[          l] = roundf(v0);
-            y[i].qs[QK8_1/2 + l] = roundf(v1);
-
-            sum0 += y[i].qs[          l];
-            sum1 += y[i].qs[QK8_1/2 + l];
-        }
-
-        y[i].s0 = d * sum0;
-        y[i].s1 = d * sum1;
-    }
-}
-
-static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK8_1 == 0);
-    const int nb = k / QK8_1;
-
-    block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        float32x4_t srcv [8];
-        float32x4_t asrcv[8];
-        float32x4_t amaxv[8];
-
-        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
-        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
-
-        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
-        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
-        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
-
-        const float amax = vmaxvq_f32(amaxv[0]);
-
-        const float d = amax / ((1 << 7) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
-
-        y[i].d = d;
-
-        int32x4_t accv0 = vdupq_n_s32(0);
-        int32x4_t accv1 = vdupq_n_s32(0);
-
-        // low half
-        for (int l = 0; l < 4; l++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
-
-            accv0 = vaddq_s32(accv0, vi);
-        }
-
-        // high half
-        for (int l = 4; l < 8; l++) {
-            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
-            const int32x4_t   vi = vcvtnq_s32_f32(v);
-
-            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
-            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
-            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
-
-            accv1 = vaddq_s32(accv1, vi);
-        }
-
-        const int32_t sum0 = vaddvq_s32(accv0);
-        const int32_t sum1 = vaddvq_s32(accv1);
-
-        y[i].s0 = d * sum0;
-        y[i].s1 = d * sum1;
-    }
-#elif defined(__AVX2__) || defined(__AVX__)
-    for (int i = 0; i < nb; i++) {
-        // Load elements into 4 AVX vectors
-        __m256 v0 = _mm256_loadu_ps( x );
-        __m256 v1 = _mm256_loadu_ps( x + 8 );
-        __m256 v2 = _mm256_loadu_ps( x + 16 );
-        __m256 v3 = _mm256_loadu_ps( x + 24 );
-        x += 32;
-
-        // Compute max(abs(e)) for the block
-        const __m256 signBit = _mm256_set1_ps( -0.0f );
-        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
-        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
-
-        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
-        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
-        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
-        const float maxScalar = _mm_cvtss_f32( max4 );
-
-        // Quantize these floats
-        const float d = maxScalar / 127.f;
-        y[i].d = d;
-        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
-        const __m256 mul = _mm256_set1_ps( id );
-
-        // Apply the multiplier
-        v0 = _mm256_mul_ps( v0, mul );
-        v1 = _mm256_mul_ps( v1, mul );
-        v2 = _mm256_mul_ps( v2, mul );
-        v3 = _mm256_mul_ps( v3, mul );
-
-        // Round to nearest integer
-        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
-        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
-        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
-        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
-
-        // Convert floats to integers
-        __m256i i0 = _mm256_cvtps_epi32( v0 );
-        __m256i i1 = _mm256_cvtps_epi32( v1 );
-        __m256i i2 = _mm256_cvtps_epi32( v2 );
-        __m256i i3 = _mm256_cvtps_epi32( v3 );
-
-#if defined(__AVX2__)
-        // Compute the sum of the quants and set y[i].s
-        //y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
-        y[i].s0 = d * hsum_i32_8(_mm256_add_epi32(i0, i1));
-        y[i].s1 = d * hsum_i32_8(_mm256_add_epi32(i2, i3));
-
-        // Convert int32 to int16
-        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
-        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
-                                            // Convert int16 to int8
-        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-
-        // We got our precious signed bytes, but the order is now wrong
-        // These AVX2 pack instructions process 16-byte pieces independently
-        // The following instruction is fixing the order
-        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
-        i0 = _mm256_permutevar8x32_epi32( i0, perm );
-
-        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
-#else
-        // Since we don't have in AVX some necessary functions,
-        // we split the registers in half and call AVX2 analogs from SSE
-        __m128i ni0 = _mm256_castsi256_si128( i0 );
-        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
-        __m128i ni2 = _mm256_castsi256_si128( i1 );
-        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
-        __m128i ni4 = _mm256_castsi256_si128( i2 );
-        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
-        __m128i ni6 = _mm256_castsi256_si128( i3 );
-        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
-
-        // Compute the sum of the quants and set y[i].s
-        const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
-        const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
-        y[i].s0 = d * hsum_i32_4(s0);
-        y[i].s1 = d * hsum_i32_4(s1);
-
-        // Convert int32 to int16
-        ni0 = _mm_packs_epi32( ni0, ni1 );
-        ni2 = _mm_packs_epi32( ni2, ni3 );
-        ni4 = _mm_packs_epi32( ni4, ni5 );
-        ni6 = _mm_packs_epi32( ni6, ni7 );
-        // Convert int16 to int8
-        ni0 = _mm_packs_epi16( ni0, ni2 );
-        ni4 = _mm_packs_epi16( ni4, ni6 );
-
-        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
-        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
-#endif
-    }
-#else
-    // scalar
-    quantize_row_q8_1_reference(x, y, k);
-#endif
-}
-
-static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK4_0 == 0);
-    const int nb = k / QK4_0;
-
-    const block_q4_0 * restrict x = vx;
-
-#if defined(__AVX2__)
-    for (int i = 0; i < nb; i++) {
-        // scale factor
-        const __m256 d_v = _mm256_broadcast_ss(&x[i].d);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK4_0; l += 32) {
-            // Load 32x4-bit integers into 32x8-bit integers
-            __m256i vx8 = bytes_from_nibbles_32(pp+l/2);
-
-            // Subtract 8 from the integers
-            vx8 = _mm256_sub_epi8(vx8, _mm256_set1_epi8(8));
-
-            // Convert to 16-bit int
-            const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0));
-            const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1));
-
-            // Convert to 32-bit int -> float 32
-            const __m256 vf[4] = {
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1)))
-            };
-
-            // Scale and store
-            for (int j = 0; j < 4; j++) {
-                const __m256 result = _mm256_mul_ps(vf[j], d_v);
-                _mm256_storeu_ps(y + i * QK4_0 + l + j*8, result);
-            }
-        }
-    }
-#elif defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        const float32x4_t vd = vdupq_n_f32(x[i].d);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK4_0; l += 16) {
-            // Load 16x4-bit integers into 8x8-bit integers
-            const uint8x8_t v8 = vld1_u8(pp + l/2);
-
-            // Expand 4-bit qs to 8-bit bytes
-            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F));
-            const uint8x8_t v1 = vshr_n_u8(v8, 4);
-
-            // Convert to signed 8-bit integers
-            const int8x8_t vs_0 = vreinterpret_s8_u8(v0);
-            const int8x8_t vs_1 = vreinterpret_s8_u8(v1);
-
-            // Subtract 8 from each byte
-            const int8x8_t vb_0 = vsub_s8(vs_0, vdup_n_s8(8));
-            const int8x8_t vb_1 = vsub_s8(vs_1, vdup_n_s8(8));
-
-            // Interleave and combine
-            const int8x8_t vx_0 = vzip1_s8(vb_0, vb_1);
-            const int8x8_t vx_1 = vzip2_s8(vb_0, vb_1);
-
-            const int8x16_t vq = vcombine_s8(vx_0, vx_1);
-
-            // convert to 2x int16x8_t
-            const int16x8_t vi_0 = vmovl_s8(vget_low_s8 (vq));
-            const int16x8_t vi_1 = vmovl_s8(vget_high_s8(vq));
-
-            // convert to 4x float32x4_t
-            const float32x4_t vf_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_0)));
-            const float32x4_t vf_1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_0)));
-            const float32x4_t vf_2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_1)));
-            const float32x4_t vf_3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_1)));
-
-            // Multiply by d
-            const float32x4_t r0 = vmulq_f32(vf_0, vd);
-            const float32x4_t r1 = vmulq_f32(vf_1, vd);
-            const float32x4_t r2 = vmulq_f32(vf_2, vd);
-            const float32x4_t r3 = vmulq_f32(vf_3, vd);
-
-            // Store
-            vst1q_f32(y + i*QK4_0 + l +  0, r0);
-            vst1q_f32(y + i*QK4_0 + l +  4, r1);
-            vst1q_f32(y + i*QK4_0 + l +  8, r2);
-            vst1q_f32(y + i*QK4_0 + l + 12, r3);
-        }
-    }
-#else
-    // scalar
-    for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK4_0; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0x0F;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = (vi0 - 8)*d;
-            const float v1 = (vi1 - 8)*d;
-
-            //printf("d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\n", d, vi, vi0, vi1, v0, v1);
-
-            y[i*QK4_0 + l + 0] = v0;
-            y[i*QK4_0 + l + 1] = v1;
-
-            assert(!isnan(y[i*QK4_0 + l + 0]));
-            assert(!isnan(y[i*QK4_0 + l + 1]));
-        }
-    }
-#endif
-}
-
-static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK4_1 == 0);
-    const int nb = k / QK4_1;
-
-    const block_q4_1 * restrict x = vx;
-
-#if defined(__AVX2__)
-    for (int i = 0; i < nb; i++) {
-        const __m256 d_v = _mm256_broadcast_ss(&x[i].d);
-        const __m256 d_m = _mm256_broadcast_ss(&x[i].m);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK4_1; l += 32) {
-            // Load 32x4-bit integers into 32x8-bit integers
-            __m256i vx8 = bytes_from_nibbles_32(pp+l/2);
-
-            // Convert to 16-bit int
-            const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0));
-            const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1));
-
-            // Convert to 32-bit int -> float 32
-            const __m256 vf[4] = {
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))),
-                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1)))
-            };
-
-            // Scale, add m and store
-            for (int j = 0; j < 4; j++) {
-                const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m);
-                _mm256_storeu_ps(y + i * QK4_1 + l + j*8, result);
-            }
-        }
-    }
-#elif defined(__ARM_NEON)
-    for (int i = 0; i < nb; i++) {
-        const float32x4_t vd = vdupq_n_f32(x[i].d);
-        const float32x4_t vm = vdupq_n_f32(x[i].m);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK4_1; l += 16) {
-            // Load 16x4-bit integers into 8x8-bit integers
-            const uint8x8_t v8 = vld1_u8(pp + l/2);
-
-            // Expand 4-bit qs to 8-bit bytes
-            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0F));
-            const uint8x8_t v1 = vshr_n_u8(v8, 4);
-
-            // Interleave and combine
-            const uint8x8_t vx_0 = vzip1_u8(v0, v1);
-            const uint8x8_t vx_1 = vzip2_u8(v0, v1);
-
-            const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
-
-            // convert to 2x uint16x8_t
-            const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq));
-            const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq));
-
-            // convert to 4x float32x4_t
-            const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
-            const float32x4_t vf_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_0)));
-            const float32x4_t vf_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_1)));
-            const float32x4_t vf_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vi_1)));
-
-            // multiply by d and add m
-            const float32x4_t r0 = vmlaq_f32(vm, vf_0, vd);
-            const float32x4_t r1 = vmlaq_f32(vm, vf_1, vd);
-            const float32x4_t r2 = vmlaq_f32(vm, vf_2, vd);
-            const float32x4_t r3 = vmlaq_f32(vm, vf_3, vd);
-
-            // Store
-            vst1q_f32(y + i*QK4_1 + l +  0, r0);
-            vst1q_f32(y + i*QK4_1 + l +  4, r1);
-            vst1q_f32(y + i*QK4_1 + l +  8, r2);
-            vst1q_f32(y + i*QK4_1 + l + 12, r3);
-        }
-    }
-#else
-    for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-        const float m = x[i].m;
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK4_1; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0x0F;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = vi0*d + m;
-            const float v1 = vi1*d + m;
-
-            y[i*QK4_1 + l + 0] = v0;
-            y[i*QK4_1 + l + 1] = v1;
-
-            assert(!isnan(y[i*QK4_1 + l + 0]));
-            assert(!isnan(y[i*QK4_1 + l + 1]));
-        }
-    }
-#endif
-}
-
-static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK4_2 == 0);
-    const int nb = k / QK4_2;
-
-    const block_q4_2 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK4_2; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0x0F;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = (vi0 - 8)*d;
-            const float v1 = (vi1 - 8)*d;
-
-            y[i*QK4_2 + l + 0] = v0;
-            y[i*QK4_2 + l + 1] = v1;
-
-            assert(!isnan(y[i*QK4_2 + l + 0]));
-            assert(!isnan(y[i*QK4_2 + l + 1]));
-        }
-    }
-}
-
-static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK5_0 == 0);
-    const int nb = k / QK5_0;
-
-    const block_q5_0 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int l = 0; l < QK5_0; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            // extract the 5-th bit from qh
-            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
-            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
-
-            const int8_t vi0 = (vi & 0x0F) | vh0;
-            const int8_t vi1 = (vi >>   4) | vh1;
-
-            const float v0 = (vi0 - 16)*d;
-            const float v1 = (vi1 - 16)*d;
-
-            y[i*QK5_0 + l + 0] = v0;
-            y[i*QK5_0 + l + 1] = v1;
-
-            assert(!isnan(y[i*QK5_0 + l + 0]));
-            assert(!isnan(y[i*QK5_0 + l + 1]));
-        }
-    }
-}
-
-static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK5_1 == 0);
-    const int nb = k / QK5_1;
-
-    const block_q5_1 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        const uint8_t * restrict pp = x[i].qs;
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        for (int l = 0; l < QK5_1; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            // extract the 5-th bit from qh
-            const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
-            const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
-
-            const uint8_t vi0 = (vi & 0x0F) | vh0;
-            const uint8_t vi1 = (vi >>   4) | vh1;
-
-            const float v0 = vi0*d + m;
-            const float v1 = vi1*d + m;
-
-            y[i*QK5_1 + l + 0] = v0;
-            y[i*QK5_1 + l + 1] = v1;
-
-            assert(!isnan(y[i*QK5_1 + l + 0]));
-            assert(!isnan(y[i*QK5_1 + l + 1]));
-        }
-    }
-}
-
-static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, int k) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    const block_q8_0 * restrict x = vx;
-
-    for (int i = 0; i < nb; i++) {
-        const float d = x[i].d;
-
-        const int8_t * restrict pp = x[i].qs;
-
-        for (int l = 0; l < QK8_0; ++l) {
-            y[i*QK8_0 + l] = pp[l]*d;
-        }
-    }
-}
-
-static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-
-static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_Q4_0] = {
-        .dequantize_row_q         = dequantize_row_q4_0,
-        .quantize_row_q           = quantize_row_q4_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
-        .quantize_row_q_dot       = quantize_row_q8_0,
-        .vec_dot_q                = ggml_vec_dot_q4_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q         = dequantize_row_q4_1,
-        .quantize_row_q           = quantize_row_q4_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
-        .quantize_row_q_dot       = quantize_row_q8_1,
-        .vec_dot_q                = ggml_vec_dot_q4_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q4_2] = {
-        .dequantize_row_q         = dequantize_row_q4_2,
-        .quantize_row_q           = quantize_row_q4_2,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference,
-        .quantize_row_q_dot       = quantize_row_q8_0,
-        .vec_dot_q                = ggml_vec_dot_q4_2_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q5_0] = {
-        .dequantize_row_q         = dequantize_row_q5_0,
-        .quantize_row_q           = quantize_row_q5_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_0_reference,
-        .quantize_row_q_dot       = quantize_row_q8_0,
-        .vec_dot_q                = ggml_vec_dot_q5_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q5_1] = {
-        .dequantize_row_q         = dequantize_row_q5_1,
-        .quantize_row_q           = quantize_row_q5_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q5_1_reference,
-        .quantize_row_q_dot       = quantize_row_q8_1,
-        .vec_dot_q                = ggml_vec_dot_q5_1_q8_1,
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-    [GGML_TYPE_Q8_0] = {
-        .dequantize_row_q         = dequantize_row_q8_0,
-        .quantize_row_q           = quantize_row_q8_0,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0_reference,
-        .quantize_row_q_dot       = quantize_row_q8_0,
-        .vec_dot_q                = ggml_vec_dot_q8_0_q8_0,
-        .vec_dot_type             = GGML_TYPE_Q8_0,
-    },
-    [GGML_TYPE_Q8_1] = {
-        .dequantize_row_q         = NULL,   // TODO
-        .quantize_row_q           = quantize_row_q8_1,
-        .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_1_reference,
-        .quantize_row_q_dot       = quantize_row_q8_1,
-        .vec_dot_q                = NULL,   // TODO
-        .vec_dot_type             = GGML_TYPE_Q8_1,
-    },
-};
-
-// For internal test use
-quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
-    GGML_ASSERT(i < GGML_TYPE_COUNT);
-    return quantize_fns[i];
-}
-
-
-//
-// simd mappings
-//
-
-// we define a common set of C macros which map to specific intrinsics based on the current architecture
-// we then implement the fundamental computation operations below using only these macros
-// adding support for new architectures requires to define the corresponding SIMD macros
-//
-// GGML_F32_STEP / GGML_F16_STEP
-//   number of elements to process in a single step
-//
-// GGML_F32_EPR / GGML_F16_EPR
-//   number of elements to fit in a single register
-//
-
-#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
-
-#define GGML_SIMD
-
-// F32 NEON
-
-#define GGML_F32_STEP 16
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              float32x4_t
-#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
-#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
-#define GGML_F32x4_LOAD         vld1q_f32
-#define GGML_F32x4_STORE        vst1q_f32
-#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
-#define GGML_F32x4_ADD          vaddq_f32
-#define GGML_F32x4_MUL          vmulq_f32
-#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vaddq_f32(x[2*i], x[2*i+1]);  \
-    }                                          \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vaddq_f32(x[4*i], x[4*i+2]);  \
-    }                                          \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vaddq_f32(x[8*i], x[8*i+4]);  \
-    }                                          \
-    res = GGML_F32x4_REDUCE_ONE(x[0]);         \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 NEON
-
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    #define GGML_F16_STEP 32
-    #define GGML_F16_EPR  8
-
-    #define GGML_F16x8              float16x8_t
-    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
-    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
-    #define GGML_F16x8_LOAD         vld1q_f16
-    #define GGML_F16x8_STORE        vst1q_f16
-    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
-    #define GGML_F16x8_ADD          vaddq_f16
-    #define GGML_F16x8_MUL          vmulq_f16
-    #define GGML_F16x8_REDUCE(res, x)                             \
-    {                                                             \
-        for (int i = 0; i < GGML_F16_ARR/2; ++i) {                \
-            x[2*i] = vaddq_f16(x[2*i], x[2*i+1]);                 \
-        }                                                         \
-        for (int i = 0; i < GGML_F16_ARR/4; ++i) {                \
-            x[4*i] = vaddq_f16(x[4*i], x[4*i+2]);                 \
-        }                                                         \
-        for (int i = 0; i < GGML_F16_ARR/8; ++i) {                \
-            x[8*i] = vaddq_f16(x[8*i], x[8*i+4]);                 \
-        }                                                         \
-        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
-        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
-        res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
-    }
-
-    #define GGML_F16_VEC                GGML_F16x8
-    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
-    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
-    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])
-    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
-    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
-    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
-    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
-#else
-    // if FP16 vector arithmetic is not supported, we use FP32 instead
-    // and take advantage of the vcvt_ functions to convert to/from FP16
-
-    #define GGML_F16_STEP 16
-    #define GGML_F16_EPR  4
-
-    #define GGML_F32Cx4              float32x4_t
-    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
-    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
-    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16(x))
-    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
-    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
-    #define GGML_F32Cx4_ADD          vaddq_f32
-    #define GGML_F32Cx4_MUL          vmulq_f32
-    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
-
-    #define GGML_F16_VEC                GGML_F32Cx4
-    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
-    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
-    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])
-    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
-    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
-    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
-    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
-#endif
-
-#elif defined(__AVX__)
-
-#define GGML_SIMD
-
-// F32 AVX
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
-
-#define GGML_F32x8         __m256
-#define GGML_F32x8_ZERO    _mm256_setzero_ps()
-#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
-#define GGML_F32x8_LOAD    _mm256_loadu_ps
-#define GGML_F32x8_STORE   _mm256_storeu_ps
-#if defined(__FMA__)
-    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
-#else
-    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
-#endif
-#define GGML_F32x8_ADD     _mm256_add_ps
-#define GGML_F32x8_MUL     _mm256_mul_ps
-#define GGML_F32x8_REDUCE(res, x)                                 \
-{                                                                 \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {                    \
-        x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]);                 \
-    }                                                             \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {                    \
-        x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]);                 \
-    }                                                             \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {                    \
-        x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]);                 \
-    }                                                             \
-    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
-                                 _mm256_extractf128_ps(x[0], 1)); \
-    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));                     \
-}
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x8
-#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
-
-// F16 AVX
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  8
-
-// F16 arithmetic is not supported by AVX, so we use F32 instead
-
-#define GGML_F32Cx8             __m256
-#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
-#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)
-
-#if defined(__F16C__)
-// the  _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
-#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
-#else
-static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
-    float tmp[8];
-
-    for (int i = 0; i < 8; i++)
-        tmp[i] = GGML_FP16_TO_FP32(x[i]);
-
-    return _mm256_loadu_ps(tmp);
-}
-static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
-    float arr[8];
-
-    _mm256_storeu_ps(arr, y);
-
-    for (int i = 0; i < 8; i++)
-        x[i] = GGML_FP32_TO_FP16(arr[i]);
-}
-#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
-#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
-#endif
-
-#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
-#define GGML_F32Cx8_ADD         _mm256_add_ps
-#define GGML_F32Cx8_MUL         _mm256_mul_ps
-#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
-
-#define GGML_F16_VEC                GGML_F32Cx8
-#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
-#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
-#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
-#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
-
-#elif defined(__POWER9_VECTOR__)
-
-#define GGML_SIMD
-
-// F32 POWER9
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              vector float
-#define GGML_F32x4_ZERO         0.0f
-#define GGML_F32x4_SET1         vec_splats
-#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
-#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
-#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD          vec_add
-#define GGML_F32x4_MUL          vec_mul
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
-        x[2*i] = vec_add(x[2*i], x[2*i+1]);    \
-    }                                          \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \
-        x[4*i] = vec_add(x[4*i], x[4*i+2]);    \
-    }                                          \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \
-        x[8*i] = vec_add(x[8*i], x[8*i+4]);    \
-    }                                          \
-    res = vec_extract(x[0], 0) +               \
-          vec_extract(x[0], 1) +               \
-          vec_extract(x[0], 2) +               \
-          vec_extract(x[0], 3);                \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 POWER9
-#define GGML_F16_STEP       GGML_F32_STEP
-#define GGML_F16_EPR        GGML_F32_EPR
-#define GGML_F16_VEC        GGML_F32x4
-#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
-// Use vec_xl, not vec_ld, in case the load address is not aligned.
-#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
-  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
-  vec_extract_fp32_from_shortl(vec_xl(0, p))
-#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
-#define GGML_F16_VEC_STORE(p, r, i)                             \
-  if (i & 0x1)                                                  \
-    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
-                                   r[i - GGML_ENDIAN_BYTE(0)]), \
-            0, p - GGML_F16_EPR)
-
-#elif defined(__wasm_simd128__)
-
-#define GGML_SIMD
-
-// F32 WASM
-
-#define GGML_F32_STEP 16
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4              v128_t
-#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
-#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
-#define GGML_F32x4_LOAD         wasm_v128_load
-#define GGML_F32x4_STORE        wasm_v128_store
-#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
-#define GGML_F32x4_ADD          wasm_f32x4_add
-#define GGML_F32x4_MUL          wasm_f32x4_mul
-#define GGML_F32x4_REDUCE(res, x)                  \
-{                                                  \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {     \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
-    }                                              \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {     \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
-    }                                              \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {     \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
-    }                                              \
-    res = wasm_f32x4_extract_lane(x[0], 0) +       \
-          wasm_f32x4_extract_lane(x[0], 1) +       \
-          wasm_f32x4_extract_lane(x[0], 2) +       \
-          wasm_f32x4_extract_lane(x[0], 3);        \
-}
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 WASM
-
-#define GGML_F16_STEP 16
-#define GGML_F16_EPR  4
-
-inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
-    float tmp[4];
-
-    tmp[0] = GGML_FP16_TO_FP32(p[0]);
-    tmp[1] = GGML_FP16_TO_FP32(p[1]);
-    tmp[2] = GGML_FP16_TO_FP32(p[2]);
-    tmp[3] = GGML_FP16_TO_FP32(p[3]);
-
-    return wasm_v128_load(tmp);
-}
-
-inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
-    float tmp[4];
-
-    wasm_v128_store(tmp, x);
-
-    p[0] = GGML_FP32_TO_FP16(tmp[0]);
-    p[1] = GGML_FP32_TO_FP16(tmp[1]);
-    p[2] = GGML_FP32_TO_FP16(tmp[2]);
-    p[3] = GGML_FP32_TO_FP16(tmp[3]);
-}
-
-#define GGML_F16x4             v128_t
-#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
-#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
-#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
-#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
-#define GGML_F16x4_FMA         GGML_F32x4_FMA
-#define GGML_F16x4_ADD         wasm_f32x4_add
-#define GGML_F16x4_MUL         wasm_f32x4_mul
-#define GGML_F16x4_REDUCE(res, x)                  \
-{                                                  \
-    for (int i = 0; i < GGML_F16_ARR/2; ++i) {     \
-        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \
-    }                                              \
-    for (int i = 0; i < GGML_F16_ARR/4; ++i) {     \
-        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \
-    }                                              \
-    for (int i = 0; i < GGML_F16_ARR/8; ++i) {     \
-        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \
-    }                                              \
-    res = wasm_f32x4_extract_lane(x[0], 0) +       \
-          wasm_f32x4_extract_lane(x[0], 1) +       \
-          wasm_f32x4_extract_lane(x[0], 2) +       \
-          wasm_f32x4_extract_lane(x[0], 3);        \
-}
-
-#define GGML_F16_VEC                GGML_F16x4
-#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
-#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
-#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
-#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
-#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
-#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
-
-#elif defined(__SSE3__)
-
-#define GGML_SIMD
-
-// F32 SSE
-
-#define GGML_F32_STEP 32
-#define GGML_F32_EPR  4
-
-#define GGML_F32x4         __m128
-#define GGML_F32x4_ZERO    _mm_setzero_ps()
-#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
-#define GGML_F32x4_LOAD    _mm_loadu_ps
-#define GGML_F32x4_STORE   _mm_storeu_ps
-#if defined(__FMA__)
-    // TODO: Does this work?
-    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
-#else
-    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
-#endif
-#define GGML_F32x4_ADD     _mm_add_ps
-#define GGML_F32x4_MUL     _mm_mul_ps
-#define GGML_F32x4_REDUCE(res, x)                                 \
-{                                                                 \
-    for (int i = 0; i < GGML_F32_ARR/2; ++i) {                    \
-        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]);                    \
-    }                                                             \
-    for (int i = 0; i < GGML_F32_ARR/4; ++i) {                    \
-        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]);                    \
-    }                                                             \
-    for (int i = 0; i < GGML_F32_ARR/8; ++i) {                    \
-        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]);                    \
-    }                                                             \
-    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
-    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));                     \
-}
-// TODO: is this optimal ?
-
-#define GGML_F32_VEC        GGML_F32x4
-#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
-#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
-#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
-#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
-#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
-#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
-#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
-#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
-
-// F16 SSE
-
-#define GGML_F16_STEP 32
-#define GGML_F16_EPR  4
-
-static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {
-    float tmp[4];
-
-    tmp[0] = GGML_FP16_TO_FP32(x[0]);
-    tmp[1] = GGML_FP16_TO_FP32(x[1]);
-    tmp[2] = GGML_FP16_TO_FP32(x[2]);
-    tmp[3] = GGML_FP16_TO_FP32(x[3]);
-
-    return _mm_loadu_ps(tmp);
-}
-
-static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {
-    float arr[4];
-
-    _mm_storeu_ps(arr, y);
-
-    x[0] = GGML_FP32_TO_FP16(arr[0]);
-    x[1] = GGML_FP32_TO_FP16(arr[1]);
-    x[2] = GGML_FP32_TO_FP16(arr[2]);
-    x[3] = GGML_FP32_TO_FP16(arr[3]);
-}
-
-#define GGML_F32Cx4             __m128
-#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
-#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
-#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
-#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
-#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
-#define GGML_F32Cx4_ADD         _mm_add_ps
-#define GGML_F32Cx4_MUL         _mm_mul_ps
-#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
-
-#define GGML_F16_VEC                 GGML_F32Cx4
-#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
-#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
-#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
-#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
-#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
-#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
-#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
-#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
-
-#endif
-
-// GGML_F32_ARR / GGML_F16_ARR
-//   number of registers to use per step
-#ifdef GGML_SIMD
-#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
-#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
-#endif
-
-//
-// fundamental operations
-//
-
-inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
-
-inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
-inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
-inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
-inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
-inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
-inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
-inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
-inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
-inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
-
-inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {
-#ifdef GGML_SIMD
-    float sumf = 0.0f;
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-
-            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F32_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += x[i]*y[i];
-    }
-#else
-    // scalar
-    ggml_float sumf = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(x[i]*y[i]);
-    }
-#endif
-
-    *s = sumf;
-}
-
-inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
-    ggml_float sumf = 0.0;
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    GGML_F16_VEC_REDUCE(sumf, sum);
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
-    }
-#endif
-
-    *s = sumf;
-}
-
-static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_0;
-
-    assert(n % QK8_0 == 0);
-    assert(nb % 2 == 0);
-
-    const block_q4_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q4_0 * restrict x0 = &x[i + 0];
-        const block_q4_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b   = vdupq_n_u8(0x0F);
-        const int8x16_t  s8b   = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-
-        // interleave
-        const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
-        const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
-        const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
-        const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8( 8 );
-        bx = _mm256_sub_epi8( bx, off );
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps( d, q, acc );
-    }
-
-    *s = hsum_float_8(acc);
-#elif defined(__AVX__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-
-        __m128i i32[2];
-        for (int j = 0; j < 2; ++j) {
-            // Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
-            __m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j);
-            __m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j));
-
-            // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-            const __m128i off = _mm_set1_epi8( 8 );
-            bx = _mm_sub_epi8( bx, off );
-
-            // Get absolute values of x vectors
-            const __m128i ax = _mm_sign_epi8(bx, bx);
-
-            // Sign the values of the y vectors
-            const __m128i sy = _mm_sign_epi8(by, bx);
-
-            // Perform multiplication and create 16-bit values
-            const __m128i dot = _mm_maddubs_epi16(ax, sy);
-
-            const __m128i ones = _mm_set1_epi16(1);
-            i32[j] = _mm_madd_epi16(ones, dot);
-        }
-
-        // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
-        // Apply the scale, and accumulate
-        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
-    }
-
-    *s = hsum_float_8(acc);
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const float d0 = x[i].d;
-        const float d1 = y[i].d;
-
-        const uint8_t * restrict p0 = x[i].qs;
-        const  int8_t * restrict p1 = y[i].qs;
-
-        int sumi = 0;
-        for (int j = 0; j < QK8_0/2; j++) {
-            const uint8_t v0 = p0[j];
-
-            const int i0 = (int8_t) (v0 & 0x0F) - 8;
-            const int i1 = (int8_t) (v0 >>   4) - 8;
-
-            const int i2 = p1[2*j + 0];
-            const int i3 = p1[2*j + 1];
-
-            sumi += i0*i2 + i1*i3;
-        }
-        sumf += d0*d1*sumi;
-    }
-    *s = sumf;
-#endif
-}
-
-static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_1;
-
-    assert(n % QK8_1 == 0);
-    assert(nb % 2 == 0);
-
-    const block_q4_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-    // TODO: add AVX / WASM SIMD / etc
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    float summs = 0;
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q4_1 * restrict x0 = &x[i + 0];
-        const block_q4_1 * restrict x1 = &x[i + 1];
-        const block_q8_1 * restrict y0 = &y[i + 0];
-        const block_q8_1 * restrict y1 = &y[i + 1];
-
-        summs += x0->m * (y0->s0 + y0->s1) + x1->m * (y1->s0 + y1->s1);
-
-        const uint8x16_t m4b = vdupq_n_u8(0x0F);
-
-        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
-        const uint8x16_t v0_1 = vld1q_u8(x1->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // interleave
-        const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
-        const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
-        const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h);
-        const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), x1->d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    float summs = 0;
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        const float * d0 = &x[i].d;
-        const float * d1 = &y[i].d;
-
-        summs += x[i].m * (y[i].s0 + y[i].s1);
-
-        const __m256 d0v = _mm256_broadcast_ss( d0 );
-        const __m256 d1v = _mm256_broadcast_ss( d1 );
-
-        // Compute combined scales
-        const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
-
-        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
-
-        const __m256 xy = mul_sum_i8_pairs_float(bx, by);
-
-        // Accumulate d0*d1*x*y
-        acc = _mm256_fmadd_ps( d0d1, xy, acc );
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const float d0 = x[i].d;
-        const float m0 = x[i].m;
-        const float d1 = y[i].d;
-
-        const uint8_t * restrict p0 = x[i].qs;
-        const  int8_t * restrict p1 = y[i].qs;
-
-        // TODO: this is very slow ..
-        for (int j = 0; j < QK8_1/2; j++) {
-            const uint8_t v0 = p0[j];
-
-            const float f0 = d0*(v0 & 0x0F) + m0;
-            const float f1 = d0*(v0 >>   4) + m0;
-
-            const float f2 = d1*p1[2*j + 0];
-            const float f3 = d1*p1[2*j + 1];
-
-            sumf += f0*f2 + f1*f3;
-        }
-    }
-    *s = sumf;
-#endif
-}
-
-static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_0;
-
-    assert(n % QK8_0 == 0);
-    assert(nb % 2 == 0);
-    assert(QK8_0 == 2*QK4_2);
-
-    const block_q4_2 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q4_2 * restrict x0_0 = &x[2*(i + 0) + 0];
-        const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1];
-        const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0];
-        const block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1];
-
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
-
-        const uint8x16_t m4b   = vdupq_n_u8(0x0F);
-        const int8x16_t  s8b   = vdupq_n_s8(0x8);
-
-        const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
-        const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs));
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8  (v0_0, m4b));
-        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
-        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8  (v0_1, m4b));
-        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
-
-        // sub 8
-        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
-        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
-        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
-        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
-
-        // interleave
-        const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
-        const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
-        const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
-        const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
-
-        // load y
-        const int8x16_t v1_0l = vld1q_s8(y0->qs);
-        const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
-        const int8x16_t v1_1l = vld1q_s8(y1->qs);
-        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv0 = vmlaq_n_f32(sumv0, vaddq_f32(
-                vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), GGML_FP16_TO_FP32(x0_0->d)),
-                vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), GGML_FP16_TO_FP32(x0_1->d))), y0->d);
-
-        sumv1 = vmlaq_n_f32(sumv1, vaddq_f32(
-                vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), GGML_FP16_TO_FP32(x1_0->d)),
-                vmulq_n_f32(vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), GGML_FP16_TO_FP32(x1_1->d))), y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vaddq_f32(
-                vmulq_n_f32(vcvtq_f32_s32(pl0), GGML_FP16_TO_FP32(x0_0->d)),
-                vmulq_n_f32(vcvtq_f32_s32(ph0), GGML_FP16_TO_FP32(x0_1->d))), y0->d);
-
-        sumv1 = vmlaq_n_f32(sumv1, vaddq_f32(
-                vmulq_n_f32(vcvtq_f32_s32(pl1), GGML_FP16_TO_FP32(x1_0->d)),
-                vmulq_n_f32(vcvtq_f32_s32(ph1), GGML_FP16_TO_FP32(x1_1->d))), y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        /* Compute combined scale for the block */
-        const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
-        const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
-        const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d));
-
-        __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
-        __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
-        __m256i bx = _mm256_set_m128i(bx1, bx0);
-
-        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
-        const __m256i off = _mm256_set1_epi8(8);
-        bx = _mm256_sub_epi8(bx, off);
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps(d, q, acc);
-    }
-
-    *s = hsum_float_8(acc);
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const uint8_t * restrict x0 = x[2*i + 0].qs;
-        const uint8_t * restrict x1 = x[2*i + 1].qs;
-        const  int8_t * restrict y0 = y[i].qs;
-
-        const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d);
-        const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d);
-
-        int sumi_0 = 0;
-        int sumi_1 = 0;
-
-        for (int j = 0; j < QK8_0/4; j++) {
-            const uint8_t v0 = x0[j];
-            const uint8_t v1 = x1[j];
-
-            const int i0_0 = (int8_t) (v0 & 0x0F) - 8;
-            const int i1_0 = (int8_t) (v0 >>   4) - 8;
-
-            const int i0_1 = (int8_t) (v1 & 0x0F) - 8;
-            const int i1_1 = (int8_t) (v1 >>   4) - 8;
-
-            const int i2_0 = y0[2*j + 0];
-            const int i3_0 = y0[2*j + 1];
-
-            const int i2_1 = y0[2*(j + QK8_0/4) + 0];
-            const int i3_1 = y0[2*(j + QK8_0/4) + 1];
-
-            sumi_0 += i0_0*i2_0 + i1_0*i3_0;
-            sumi_1 += i0_1*i2_1 + i1_1*i3_1;
-        }
-
-        sumf += (d0 * y[i].d) * sumi_0;
-        sumf += (d1 * y[i].d) * sumi_1;
-    }
-    *s = sumf;
-#endif
-}
-
-static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_0;
-
-    assert(n % QK8_0 == 0);
-    assert(nb % 2 == 0);
-    assert(QK8_0 == QK5_0);
-
-    const block_q5_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv = vdupq_n_f32(0.0f);
-
-    uint64_t tmp[4];
-
-    for (int i = 0; i < nb; ++i) {
-        const block_q5_0 * restrict x0 = &x[i];
-        const block_q8_0 * restrict y0 = &y[i];
-
-        const uint8x16_t m4b  = vdupq_n_u8(0x0F);
-        const int8x16_t  s16b = vdupq_n_s8(0x10);
-
-        // extract the 5th bit
-        uint32_t qh;
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_u[(qh >> 24)       ];
-
-        const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
-        const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
-
-        const uint8x16_t v0 = vld1q_u8(x0->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8  (v0, m4b));
-        const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4));
-
-        // interleave
-        const int8x16_t v0lz = vzip1q_s8(v0l, v0h);
-        const int8x16_t v0hz = vzip2q_s8(v0l, v0h);
-
-        // add high bit and sub 16
-        const int8x16_t v0lf = vsubq_s8(vorrq_s8(v0lz, qhl), s16b);
-        const int8x16_t v0hf = vsubq_s8(vorrq_s8(v0hz, qhh), s16b);
-
-        // load y
-        const int8x16_t v1l = vld1q_s8(y0->qs);
-        const int8x16_t v1h = vld1q_s8(y0->qs + 16);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0lf, v1l),
-                        vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-
-        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv);
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    uint64_t tmp[4];
-
-    for (int i = 0; i < nb; ++i) {
-        const block_q5_0 * restrict x0 = &x[i];
-        const block_q8_0 * restrict y0 = &y[i];
-
-        const v128_t m4b  = wasm_i8x16_splat(0x0F);
-        const v128_t s16b = wasm_i8x16_splat(0x10);
-
-        // extract the 5th bit
-        uint32_t qh;
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_u[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        // interleave
-        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h,  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23);
-        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h,  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-
-        // add high bit and sub 16
-        const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
-        const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
-                        wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
-    }
-
-    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        /* Compute combined scale for the block */
-        const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
-        bx = _mm256_or_si256(bx, bxhi);
-
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        /* Multiply q with scale and accumulate */
-        acc = _mm256_fmadd_ps(d, q, acc);
-    }
-
-    *s = hsum_float_8(acc);
-#else
-    // scalar
-    float sumf = 0.0;
-    for (int i = 0; i < nb; i++) {
-        const uint8_t * restrict x0 = x[i].qs;
-        const  int8_t * restrict y0 = y[i].qs;
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-
-        int sxy = 0;
-
-        for (int j = 0; j < QK8_0/2; j++) {
-            const uint8_t v0 = x0[j];
-
-            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
-            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
-
-            const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
-            const int x1_0 = ((v0 >>   4) | x1_0h) - 16;
-
-            const int y0_0 = y0[2*j + 0];
-            const int y1_0 = y0[2*j + 1];
-
-            sxy += x0_0*y0_0 + x1_0*y1_0;
-        }
-
-        sumf += (d*sxy)*y[i].d;
-    }
-    *s = sumf;
-#endif
-}
-
-static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_1;
-
-    assert(n % QK8_1 == 0);
-    assert(nb % 2 == 0);
-    assert(QK8_1 == QK5_1);
-
-    const block_q5_1 * restrict x = vx;
-    const block_q8_1 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv = vdupq_n_f32(0.0f);
-
-    float summs = 0.0f;
-
-    uint64_t tmp[4];
-
-    for (int i = 0; i < nb; ++i) {
-        const block_q5_1 * restrict x0 = &x[i];
-        const block_q8_1 * restrict y0 = &y[i];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
-
-        // extract the 5th bit
-        uint32_t qh;
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_u[(qh >> 24)       ];
-
-        const int8x16_t qhl = vld1q_s8((const int8_t *)(tmp + 0));
-        const int8x16_t qhh = vld1q_s8((const int8_t *)(tmp + 2));
-
-        const uint8x16_t v0 = vld1q_u8(x0->qs);
-
-        // 4-bit -> 8-bit
-        const int8x16_t v0l = vreinterpretq_s8_u8(vandq_u8  (v0, vdupq_n_u8(0x0F)));
-        const int8x16_t v0h = vreinterpretq_s8_u8(vshrq_n_u8(v0, 4));
-
-        // interleave
-        const int8x16_t v0lz = vzip1q_s8(v0l, v0h);
-        const int8x16_t v0hz = vzip2q_s8(v0l, v0h);
-
-        // add
-        const int8x16_t v0lf = vorrq_s8(v0lz, qhl);
-        const int8x16_t v0hf = vorrq_s8(v0hz, qhh);
-
-        // load y
-        const int8x16_t v1l = vld1q_s8(y0->qs);
-        const int8x16_t v1h = vld1q_s8(y0->qs + 16);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0lf, v1l),
-                        vdotq_s32(vdupq_n_s32(0), v0hf, v1h))), x0d*y0->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0lf), vget_low_s8 (v1l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0lf), vget_high_s8(v1l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0hf), vget_low_s8 (v1h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0hf), vget_high_s8(v1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-
-        sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), x0d*y0->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv) + summs;
-#elif defined(__wasm_simd128__)
-    v128_t sumv = wasm_f32x4_splat(0.0f);
-
-    float summs = 0.0f;
-
-    uint64_t tmp[4];
-
-    for (int i = 0; i < nb; ++i) {
-        const block_q5_1 * restrict x0 = &x[i];
-        const block_q8_1 * restrict y0 = &y[i];
-
-        summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
-
-        const v128_t m4b = wasm_i8x16_splat(0x0F);
-
-        // extract the 5th bit
-        uint32_t qh;
-        memcpy(&qh, x0->qh, sizeof(qh));
-
-        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
-        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
-        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
-        tmp[3] = table_b2b_u[(qh >> 24)       ];
-
-        const v128_t qhl = wasm_v128_load(tmp + 0);
-        const v128_t qhh = wasm_v128_load(tmp + 2);
-
-        const v128_t v0 = wasm_v128_load(x0->qs);
-
-        // 4-bit -> 8-bit
-        const v128_t v0l = wasm_v128_and (v0, m4b);
-        const v128_t v0h = wasm_u8x16_shr(v0, 4);
-
-        static bool x = true;
-
-        // interleave
-        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h,  0, 16,  1, 17,  2, 18,  3, 19,  4, 20,  5, 21,  6, 22,  7, 23);
-        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h,  8, 24,  9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-
-        // add high bit
-        const v128_t v0lf = wasm_v128_or(v0lz, qhl);
-        const v128_t v0hf = wasm_v128_or(v0hz, qhh);
-
-        // load y
-        const v128_t v1l = wasm_v128_load(y0->qs);
-        const v128_t v1h = wasm_v128_load(y0->qs + 16);
-
-        // int8x16 -> int16x8
-        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
-        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
-        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
-        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
-
-        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
-        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
-        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
-        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
-
-        const float x0d = GGML_FP16_TO_FP32(x0->d);
-
-        // dot product
-        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
-                        wasm_i32x4_add(
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
-                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
-                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
-                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
-    }
-
-    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
-         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-    float summs = 0.0f;
-
-    // Main loop
-    for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
-
-        summs += GGML_FP16_TO_FP32(x[i].m) * (y[i].s0 + y[i].s1);
-
-        __m256i bx = bytes_from_nibbles_32(x[i].qs);
-        __m256i bxhi = bytes_from_bits_32(x[i].qh);
-        bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
-        bx = _mm256_or_si256(bx, bxhi);
-
-        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
-        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
-    }
-
-    *s = hsum_float_8(acc) + summs;
-#else
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        const uint8_t * restrict x0 = x[i].qs;
-        const  int8_t * restrict y0 = y[i].qs;
-
-        uint32_t qh;
-        memcpy(&qh, x[i].qh, sizeof(qh));
-
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
-
-        int sxy = 0;
-
-        for (int j = 0; j < QK8_1/2; j++) {
-            const uint8_t v0 = x0[j];
-
-            const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
-            const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
-
-            const int x0_0 = (v0 & 0x0F) | x0_0h;
-            const int x1_0 = (v0 >>   4) | x1_0h;
-
-            const int y0_0 = y0[2*j + 0];
-            const int y1_0 = y0[2*j + 1];
-
-            sxy += x0_0*y0_0 + x1_0*y1_0;
-        }
-
-        sumf += (d*sxy)*y[i].d + m*(y[i].s0 + y[i].s1);
-    }
-
-    *s = sumf;
-#endif
-}
-
-static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
-    const int nb = n / QK8_0;
-
-    assert(n % QK8_0 == 0);
-    assert(nb % 2 == 0);
-    assert(QK8_0 == QK8_0);
-
-    const block_q8_0 * restrict x = vx;
-    const block_q8_0 * restrict y = vy;
-
-#if defined(__ARM_NEON)
-    float32x4_t sumv0 = vdupq_n_f32(0.0f);
-    float32x4_t sumv1 = vdupq_n_f32(0.0f);
-
-    for (int i = 0; i < nb; i += 2) {
-        const block_q8_0 * restrict x0 = &x[i + 0];
-        const block_q8_0 * restrict x1 = &x[i + 1];
-        const block_q8_0 * restrict y0 = &y[i + 0];
-        const block_q8_0 * restrict y1 = &y[i + 1];
-
-        const int8x16_t x0_0 = vld1q_s8(x0->qs);
-        const int8x16_t x0_1 = vld1q_s8(x0->qs + 16);
-        const int8x16_t x1_0 = vld1q_s8(x1->qs);
-        const int8x16_t x1_1 = vld1q_s8(x1->qs + 16);
-
-        // load y
-        const int8x16_t y0_0 = vld1q_s8(y0->qs);
-        const int8x16_t y0_1 = vld1q_s8(y0->qs + 16);
-        const int8x16_t y1_0 = vld1q_s8(y1->qs);
-        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);
-
-#if defined(__ARM_FEATURE_DOTPROD)
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), x0->d*y0->d);
-
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), x1->d*y1->d);
-
-#else
-        const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
-        const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
-        const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
-        const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
-
-        const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
-        const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
-        const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
-        const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
-
-        const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
-        const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
-        const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
-        const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), x0->d*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), x1->d*y1->d);
-#endif
-    }
-
-    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__)
-    // Initialize accumulator with zeros
-    __m256 acc = _mm256_setzero_ps();
-
-    // Main loop
-    for (int i = 0; i < nb; ++i) {
-        // Compute combined scale for the block
-        const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
-        __m256i bx = _mm256_loadu_si256((const __m256i *)x[i].qs);
-        __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
-
-        const __m256 q = mul_sum_i8_pairs_float(bx, by);
-
-        // Multiply q with scale and accumulate
-        acc = _mm256_fmadd_ps( d, q, acc );
-    }
-
-    *s = hsum_float_8(acc);
-#else
-    // scalar
-    float sumf = 0.0;
-
-    for (int i = 0; i < nb; i++) {
-        const int8_t * restrict x0 = x[i].qs;
-        const int8_t * restrict y0 = y[i].qs;
-
-        int sumi = 0;
-
-        for (int j = 0; j < QK8_0; j++) {
-            const int v0 = x0[j];
-            const int v1 = y0[j];
-
-            sumi += v0*v1;
-        }
-
-        sumf += (x[i].d*y[i].d)*sumi;
-    }
-
-    *s = sumf;
-#endif
-}
-
-// compute GGML_VEC_DOT_UNROLL dot products at once
-// xs - x row stride in bytes
-inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
-    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
-
-    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
-
-    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
-        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
-    }
-
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-
-            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
-                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
-
-                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
-            }
-        }
-    }
-
-    // reduce sum0..sum3 to sum0
-    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
-        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
-        }
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
-        }
-    }
-#endif
-
-    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
-        s[i] = sumf[i];
-    }
-}
-
-inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ax[GGML_F32_ARR];
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] += x[i]*v;
-    }
-#endif
-}
-
-//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
-inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F32_STEP - 1));
-
-    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
-
-    GGML_F32_VEC ay[GGML_F32_ARR];
-
-    for (int i = 0; i < np; i += GGML_F32_STEP) {
-        for (int j = 0; j < GGML_F32_ARR; j++) {
-            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
-            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
-
-            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        y[i] *= v;
-    }
-#else
-    // scalar
-    for (int i = 0; i < n; ++i) {
-        y[i] *= v;
-    }
-#endif
-}
-
-inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s);   }
-inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
-inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
-inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
-inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
-inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
-inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-
-static const float GELU_COEF_A    = 0.044715f;
-static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-
-inline static float ggml_gelu_f32(float x) {
-    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
-}
-
-inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    const uint16_t * i16 = (const uint16_t *) x;
-    for (int i = 0; i < n; ++i) {
-        y[i] = table_gelu_f16[i16[i]];
-    }
-}
-
-#ifdef GGML_GELU_FP16
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);
-    }
-}
-#else
-inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_gelu_f32(x[i]);
-    }
-}
-#endif
-
-// Sigmoid Linear Unit (SiLU) function
-inline static float ggml_silu_f32(float x) {
-    return x/(1.0f + expf(-x));
-}
-
-inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
-    const uint16_t * i16 = (const uint16_t *) x;
-    for (int i = 0; i < n; ++i) {
-        y[i] = table_silu_f16[i16[i]];
-    }
-}
-
-#ifdef GGML_SILU_FP16
-inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    uint16_t t;
-    for (int i = 0; i < n; ++i) {
-        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
-        memcpy(&t, &fp16, sizeof(uint16_t));
-        y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]);
-    }
-}
-#else
-inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {
-    for (int i = 0; i < n; ++i) {
-        y[i] = ggml_silu_f32(x[i]);
-    }
-}
-#endif
-
-inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
-#ifndef GGML_USE_ACCELERATE
-    ggml_float sum = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sum += (ggml_float)x[i];
-    }
-    *s = sum;
-#else
-    vDSP_sve(x, 1, s, n);
-#endif
-}
-
-inline static void ggml_vec_sum_ggf(const int n, ggml_float * s, const float * x) {
-    ggml_float sum = 0.0;
-    for (int i = 0; i < n; ++i) {
-        sum += (ggml_float)x[i];
-    }
-    *s = sum;
-}
-
-inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
-#ifndef GGML_USE_ACCELERATE
-    float max = -INFINITY;
-    for (int i = 0; i < n; ++i) {
-        max = MAX(max, x[i]);
-    }
-    *s = max;
-#else
-    vDSP_maxv(x, 1, s, n);
-#endif
-}
-
-inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
-    ggml_vec_norm_f32(n, s, x);
-    *s = 1.f/(*s);
-}
-
-//
-// logging
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-//
-// data types
-//
-
-static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = 1,
-    [GGML_TYPE_F16]  = 1,
-    [GGML_TYPE_Q4_0] = QK4_0,
-    [GGML_TYPE_Q4_1] = QK4_1,
-    [GGML_TYPE_Q4_2] = QK4_2,
-    [GGML_TYPE_Q5_0] = QK5_0,
-    [GGML_TYPE_Q5_1] = QK5_1,
-    [GGML_TYPE_Q8_0] = QK8_0,
-    [GGML_TYPE_Q8_1] = QK8_1,
-    [GGML_TYPE_I8]   = 1,
-    [GGML_TYPE_I16]  = 1,
-    [GGML_TYPE_I32]  = 1,
-};
-static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
-
-static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = sizeof(float),
-    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
-    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
-    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
-    [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
-    [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
-    [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
-    [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
-    [GGML_TYPE_Q8_1] = sizeof(block_q8_1),
-    [GGML_TYPE_I8]   = sizeof(int8_t),
-    [GGML_TYPE_I16]  = sizeof(int16_t),
-    [GGML_TYPE_I32]  = sizeof(int32_t),
-};
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
-
-
-static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = "f32",
-    [GGML_TYPE_F16]  = "f16",
-    [GGML_TYPE_Q4_0] = "q4_0",
-    [GGML_TYPE_Q4_1] = "q4_1",
-    [GGML_TYPE_Q4_2] = "q4_2",
-    [GGML_TYPE_Q5_0] = "q5_0",
-    [GGML_TYPE_Q5_1] = "q5_1",
-    [GGML_TYPE_Q8_0] = "q8_0",
-    [GGML_TYPE_Q8_1] = "q8_1",
-    [GGML_TYPE_I8]   = "i8",
-    [GGML_TYPE_I16]  = "i16",
-    [GGML_TYPE_I32]  = "i32",
-};
-static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
-
-static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
-    [GGML_TYPE_F32]  = false,
-    [GGML_TYPE_F16]  = false,
-    [GGML_TYPE_Q4_0] = true,
-    [GGML_TYPE_Q4_1] = true,
-    [GGML_TYPE_Q4_2] = true,
-    [GGML_TYPE_Q5_0] = true,
-    [GGML_TYPE_Q5_1] = true,
-    [GGML_TYPE_Q8_0] = true,
-    [GGML_TYPE_Q8_1] = true,
-    [GGML_TYPE_I8]   = false,
-    [GGML_TYPE_I16]  = false,
-    [GGML_TYPE_I32]  = false,
-};
-static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
-
-static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
-    "NONE",
-
-    "DUP",
-    "ADD",
-    "SUB",
-    "MUL",
-    "DIV",
-    "SQR",
-    "SQRT",
-    "SUM",
-    "MEAN",
-    "REPEAT",
-    "ABS",
-    "SGN",
-    "NEG",
-    "STEP",
-    "RELU",
-    "GELU",
-    "SILU",
-    "NORM",
-    "RMS_NORM",
-
-    "MUL_MAT",
-
-    "SCALE",
-    "CPY",
-    "CONT",
-    "RESHAPE",
-    "VIEW",
-    "PERMUTE",
-    "TRANSPOSE",
-    "GET_ROWS",
-    "DIAG_MASK_INF",
-    "SOFT_MAX",
-    "ROPE",
-    "ALIBI",
-    "CONV_1D_1S",
-    "CONV_1D_2S",
-
-    "FLASH_ATTN",
-    "FLASH_FF",
-
-    "MAP_UNARY",
-    "MAP_BINARY",
-};
-
-static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
-
-static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
-    "none",
-
-    "x",
-    "x+y",
-    "x-y",
-    "x*y",
-    "x/y",
-    "x^2",
-    "√x",
-    "Σx",
-    "Σx/n",
-    "repeat(x)",
-    "abs(x)",
-    "sgn(x)",
-    "-x",
-    "step(x)",
-    "relu(x)",
-    "gelu(x)",
-    "silu(x)",
-    "norm(x)",
-    "rms_norm(x)",
-
-    "X*Y",
-
-    "x*v",
-    "x-\\>y",
-    "cont(x)",
-    "reshape(x)",
-    "view(x)",
-    "permute(x)",
-    "transpose(x)",
-    "get_rows(x)",
-    "diag_mask_inf(x)",
-    "soft_max(x)",
-    "rope(x)",
-    "alibi(x)",
-    "conv_1d_1s(x)",
-    "conv_1d_2s(x)",
-
-    "flash_attn(x)",
-    "flash_ff(x)",
-
-    "f(x)",
-    "f(x,y)",
-};
-
-static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
-
-static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
-static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
-
-//
-// ggml context
-//
-
-struct ggml_context {
-    size_t mem_size;
-    void * mem_buffer;
-    bool   mem_buffer_owned;
-    bool   no_alloc;
-
-    int    n_objects;
-
-    struct ggml_object * objects_begin;
-    struct ggml_object * objects_end;
-
-    struct ggml_scratch scratch;
-    struct ggml_scratch scratch_save;
-};
-
-struct ggml_context_container {
-    bool used;
-
-    struct ggml_context context;
-};
-
-//
-// compute types
-//
-
-enum ggml_task_type {
-    GGML_TASK_INIT = 0,
-    GGML_TASK_COMPUTE,
-    GGML_TASK_FINALIZE,
-};
-
-struct ggml_compute_params {
-    enum ggml_task_type type;
-
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-};
-
-//
-// ggml state
-//
-
-struct ggml_state {
-    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];
-};
-
-// global state
-static struct ggml_state g_state;
-static atomic_int g_state_barrier = 0;
-
-// barrier via spin lock
-inline static void ggml_critical_section_start(void) {
-    int processing = atomic_fetch_add(&g_state_barrier, 1);
-
-    while (processing > 0) {
-        // wait for other threads to finish
-        atomic_fetch_sub(&g_state_barrier, 1);
-        sched_yield(); // TODO: reconsider this
-        processing = atomic_fetch_add(&g_state_barrier, 1);
-    }
-}
-
-// TODO: make this somehow automatically executed
-//       some sort of "sentry" mechanism
-inline static void ggml_critical_section_end(void) {
-    atomic_fetch_sub(&g_state_barrier, 1);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-void ggml_print_object(const struct ggml_object * obj) {
-    GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
-            obj->offs, obj->size, (const void *) obj->next);
-}
-
-void ggml_print_objects(const struct ggml_context * ctx) {
-    struct ggml_object * obj = ctx->objects_begin;
-
-    GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx);
-
-    while (obj != NULL) {
-        ggml_print_object(obj);
-        obj = obj->next;
-    }
-
-    GGML_PRINT("%s: --- end ---\n", __func__);
-}
-
-int64_t ggml_nelements(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
-}
-
-int ggml_nrows(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
-}
-
-size_t ggml_nbytes(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
-}
-
-int ggml_blck_size(enum ggml_type type) {
-    return GGML_BLCK_SIZE[type];
-}
-
-size_t ggml_type_size(enum ggml_type type) {
-    return GGML_TYPE_SIZE[type];
-}
-
-float ggml_type_sizef(enum ggml_type type) {
-    return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];
-}
-
-const char * ggml_type_name(enum ggml_type type) {
-    return GGML_TYPE_NAME[type];
-}
-
-
-size_t ggml_element_size(const struct ggml_tensor * tensor) {
-    return GGML_TYPE_SIZE[tensor->type];
-}
-
-static inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
-}
-
-static inline bool ggml_is_vector(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;
-}
-
-static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return tensor->ne[2] == 1 && tensor->ne[3] == 1;
-}
-
-static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        (t0->ne[0] == t1->ne[0])  &&
-        (t0->ne[2] == t1->ne[2])  &&
-        (t0->ne[3] == t1->ne[3]);
-}
-
-bool ggml_is_quantized(enum ggml_type type) {
-    return GGML_IS_QUANTIZED[type];
-}
-
-enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
-    enum ggml_type wtype = GGML_TYPE_COUNT;
-
-    switch (ftype) {
-        case GGML_FTYPE_ALL_F32:              wtype = GGML_TYPE_F32;   break;
-        case GGML_FTYPE_MOSTLY_F16:           wtype = GGML_TYPE_F16;   break;
-        case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
-        case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
-        case GGML_FTYPE_MOSTLY_Q4_2:          wtype = GGML_TYPE_Q4_2;  break;
-        case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
-        case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
-        case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
-        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
-        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
-    }
-
-    GGML_ASSERT(wtype != GGML_TYPE_COUNT);
-
-    return wtype;
-}
-
-static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
-    return tensor->nb[0] > tensor->nb[1];
-}
-
-static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
-        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
-}
-
-static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&
-        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&
-        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
-}
-
-static inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        (t0->ne[0] == t1->ne[0] ) &&
-        (t0->ne[1] == t1->ne[1] ) &&
-        (t0->ne[2] == t1->ne[2] ) &&
-        (t0->ne[3] == t1->ne[3] );
-}
-
-// check if t1 can be represented as a repeatition of t0
-static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return
-        (t1->ne[0]%t0->ne[0] == 0) &&
-        (t1->ne[1]%t0->ne[1] == 0) &&
-        (t1->ne[2]%t0->ne[2] == 0) &&
-        (t1->ne[3]%t0->ne[3] == 0);
-}
-
-static inline int ggml_up32(int n) {
-    return (n + 31) & ~31;
-}
-
-static inline int ggml_up64(int n) {
-    return (n + 63) & ~63;
-}
-
-static inline int ggml_up(int n, int m) {
-    // assert m is a power of 2
-    GGML_ASSERT((m & (m - 1)) == 0);
-    return (n + m - 1) & ~(m - 1);
-}
-
-// assert that pointer is aligned to GGML_MEM_ALIGN
-#define ggml_assert_aligned(ptr) \
-    GGML_ASSERT(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)
-
-////////////////////////////////////////////////////////////////////////////////
-
-struct ggml_context * ggml_init(struct ggml_init_params params) {
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    static bool is_first_call = true;
-
-    if (is_first_call) {
-        // initialize time system (required on Windows)
-        ggml_time_init();
-
-        // initialize GELU, SILU and EXP F32 tables
-        {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
-
-            ggml_fp16_t ii;
-            for (int i = 0; i < (1 << 16); ++i) {
-                uint16_t ui = i;
-                memcpy(&ii, &ui, sizeof(ii));
-                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);
-                table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));
-                table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));
-                table_exp_f16[i]  = GGML_FP32_TO_FP16(expf(f));
-            }
-
-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
-
-            GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
-        }
-
-        // initialize g_state
-        {
-            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
-
-            g_state = (struct ggml_state) {
-                /*.contexts =*/ { { 0 } },
-            };
-
-            for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {
-                g_state.contexts[i].used = false;
-            }
-
-            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
-
-            GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
-        }
-
-#if defined(GGML_USE_CUBLAS)
-        ggml_init_cublas();
-#elif defined(GGML_USE_CLBLAST)
-        ggml_cl_init();
-#endif
-
-        is_first_call = false;
-    }
-
-    // find non-used context in g_state
-    struct ggml_context * ctx = NULL;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (!g_state.contexts[i].used) {
-            g_state.contexts[i].used = true;
-            ctx = &g_state.contexts[i].context;
-
-            GGML_PRINT_DEBUG("%s: found unused context %d\n", __func__, i);
-            break;
-        }
-    }
-
-    if (ctx == NULL) {
-        GGML_PRINT_DEBUG("%s: no unused context found\n", __func__);
-
-        ggml_critical_section_end();
-
-        return NULL;
-    }
-
-    const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
-
-    *ctx = (struct ggml_context) {
-        /*.mem_size           =*/ mem_size,
-        /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(mem_size),
-        /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
-        /*.no_alloc           =*/ params.no_alloc,
-        /*.n_objects          =*/ 0,
-        /*.objects_begin      =*/ NULL,
-        /*.objects_end        =*/ NULL,
-        /*.scratch            =*/ { 0, 0, NULL, },
-        /*.scratch_save       =*/ { 0, 0, NULL, },
-    };
-
-    GGML_ASSERT(ctx->mem_buffer != NULL);
-
-    ggml_assert_aligned(ctx->mem_buffer);
-
-    GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
-
-    ggml_critical_section_end();
-
-    return ctx;
-}
-
-void ggml_free(struct ggml_context * ctx) {
-    // make this function thread safe
-    ggml_critical_section_start();
-
-    bool found = false;
-
-    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {
-        if (&g_state.contexts[i].context == ctx) {
-            g_state.contexts[i].used = false;
-
-            GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
-                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
-
-            if (ctx->mem_buffer_owned) {
-                GGML_ALIGNED_FREE(ctx->mem_buffer);
-            }
-
-            found = true;
-            break;
-        }
-    }
-
-    if (!found) {
-        GGML_PRINT_DEBUG("%s: context not found\n", __func__);
-    }
-
-    ggml_critical_section_end();
-}
-
-size_t ggml_used_mem(const struct ggml_context * ctx) {
-    return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
-}
-
-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
-    const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0;
-
-    ctx->scratch = scratch;
-
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-struct ggml_tensor * ggml_new_tensor_impl(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int    n_dims,
-        const int64_t* ne,
-        void*  data) {
-    // always insert objects at the end of the context's memory pool
-    struct ggml_object * obj_cur = ctx->objects_end;
-
-    const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
-    const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
-    const size_t cur_end  = cur_offs + cur_size;
-
-    size_t size_needed = 0;
-
-    if (data == NULL && !ctx->no_alloc) {
-        size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
-        for (int i = 1; i < n_dims; i++) {
-            size_needed *= ne[i];
-        }
-        // align to GGML_MEM_ALIGN
-        size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
-    }
-
-    char * const mem_buffer = ctx->mem_buffer;
-    struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
-
-    if (ctx->scratch.data == NULL || data != NULL) {
-        size_needed += sizeof(struct ggml_tensor);
-
-        if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
-            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
-            assert(false);
-            return NULL;
-        }
-
-        *obj_new = (struct ggml_object) {
-            .offs = cur_end + GGML_OBJECT_SIZE,
-            .size = size_needed,
-            .next = NULL,
-        };
-    } else {
-        if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
-            GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
-            assert(false);
-            return NULL;
-        }
-
-        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {
-            GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                    __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);
-            assert(false);
-            return NULL;
-        }
-
-        data = (char * const) ctx->scratch.data + ctx->scratch.offs;
-
-        *obj_new = (struct ggml_object) {
-            .offs = cur_end + GGML_OBJECT_SIZE,
-            .size = sizeof(struct ggml_tensor),
-            .next = NULL,
-        };
-
-        //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
-
-        ctx->scratch.offs += size_needed;
-    }
-
-    if (obj_cur != NULL) {
-        obj_cur->next = obj_new;
-    } else {
-        // this is the first object in this context
-        ctx->objects_begin = obj_new;
-    }
-
-    ctx->objects_end = obj_new;
-
-    //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
-
-    struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
-
-    ggml_assert_aligned(result);
-
-    *result = (struct ggml_tensor) {
-        /*.type         =*/ type,
-        /*.n_dims       =*/ n_dims,
-        /*.ne           =*/ { 1, 1, 1, 1 },
-        /*.nb           =*/ { 0, 0, 0, 0 },
-        /*.op           =*/ GGML_OP_NONE,
-        /*.is_param     =*/ false,
-        /*.grad         =*/ NULL,
-        /*.src0         =*/ NULL,
-        /*.src1         =*/ NULL,
-        /*.opt          =*/ { NULL },
-        /*.n_tasks      =*/ 0,
-        /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
-        /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
-        /*.name         =*/ { 0 },
-        /*.pad          =*/ { 0 },
-    };
-
-    // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
-    //ggml_assert_aligned(result->data);
-
-    for (int i = 0; i < n_dims; i++) {
-        result->ne[i] = ne[i];
-    }
-
-    result->nb[0] = GGML_TYPE_SIZE[type];
-    result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);
-    for (int i = 2; i < GGML_MAX_DIMS; i++) {
-        result->nb[i] = result->nb[i - 1]*result->ne[i - 1];
-    }
-
-    ctx->n_objects++;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_new_tensor(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int    n_dims,
-        const int64_t * ne) {
-    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
-}
-
-struct ggml_tensor * ggml_new_tensor_1d(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int64_t ne0) {
-    return ggml_new_tensor(ctx, type, 1, &ne0);
-}
-
-struct ggml_tensor * ggml_new_tensor_2d(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int64_t ne0,
-        int64_t ne1) {
-    const int64_t ne[2] = { ne0, ne1 };
-    return ggml_new_tensor(ctx, type, 2, ne);
-}
-
-struct ggml_tensor * ggml_new_tensor_3d(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int64_t ne0,
-        int64_t ne1,
-        int64_t ne2) {
-    const int64_t ne[3] = { ne0, ne1, ne2 };
-    return ggml_new_tensor(ctx, type, 3, ne);
-}
-
-struct ggml_tensor * ggml_new_tensor_4d(
-        struct ggml_context * ctx,
-        enum   ggml_type type,
-        int64_t ne0,
-        int64_t ne1,
-        int64_t ne2,
-        int64_t ne3) {
-    const int64_t ne[4] = { ne0, ne1, ne2, ne3 };
-    return ggml_new_tensor(ctx, type, 4, ne);
-}
-
-struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
-    ctx->scratch_save = ctx->scratch;
-    ctx->scratch.data = NULL;
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
-
-    ctx->scratch = ctx->scratch_save;
-
-    ggml_set_i32(result, value);
-
-    return result;
-}
-
-struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
-    ctx->scratch_save = ctx->scratch;
-    ctx->scratch.data = NULL;
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-
-    ctx->scratch = ctx->scratch_save;
-
-    ggml_set_f32(result, value);
-
-    return result;
-}
-
-struct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {
-    return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);
-}
-
-struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
-    memset(tensor->data, 0, ggml_nbytes(tensor));
-    return tensor;
-}
-
-struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {
-    const int n     = ggml_nrows(tensor);
-    const int nc    = tensor->ne[0];
-    const size_t n1 = tensor->nb[1];
-
-    char * const data = tensor->data;
-
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                assert(tensor->nb[0] == sizeof(int8_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I16:
-            {
-                assert(tensor->nb[0] == sizeof(int16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I32:
-            {
-                assert(tensor->nb[0] == sizeof(int32_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_F16:
-            {
-                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_F32:
-            {
-                assert(tensor->nb[0] == sizeof(float));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
-                }
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    return tensor;
-}
-
-struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {
-    const int n     = ggml_nrows(tensor);
-    const int nc    = tensor->ne[0];
-    const size_t n1 = tensor->nb[1];
-
-    char * const data = tensor->data;
-
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                assert(tensor->nb[0] == sizeof(int8_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I16:
-            {
-                assert(tensor->nb[0] == sizeof(int16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_I32:
-            {
-                assert(tensor->nb[0] == sizeof(int32_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_F16:
-            {
-                assert(tensor->nb[0] == sizeof(ggml_fp16_t));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);
-                }
-            } break;
-        case GGML_TYPE_F32:
-            {
-                assert(tensor->nb[0] == sizeof(float));
-                for (int i = 0; i < n; i++) {
-                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);
-                }
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    return tensor;
-}
-
-int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                return ((int8_t *)(tensor->data))[i];
-            } break;
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                return ((int16_t *)(tensor->data))[i];
-            } break;
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                return ((int32_t *)(tensor->data))[i];
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                return ((float *)(tensor->data))[i];
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    return 0.0f;
-}
-
-void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                ((int8_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                ((int16_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                ((int32_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                ((float *)(tensor->data))[i] = value;
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                return ((int8_t *)(tensor->data))[i];
-            } break;
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                return ((int16_t *)(tensor->data))[i];
-            } break;
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                return ((int32_t *)(tensor->data))[i];
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                return ((float *)(tensor->data))[i];
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    return 0.0f;
-}
-
-void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {
-    switch (tensor->type) {
-        case GGML_TYPE_I8:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));
-                ((int8_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));
-                ((int16_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_I32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));
-                ((int32_t *)(tensor->data))[i] = value;
-            } break;
-        case GGML_TYPE_F16:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));
-                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(tensor->nb[0] == sizeof(float));
-                ((float *)(tensor->data))[i] = value;
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-void * ggml_get_data(const struct ggml_tensor * tensor) {
-    return tensor->data;
-}
-
-float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
-    assert(tensor->type == GGML_TYPE_F32);
-    return (float *)(tensor->data);
-}
-
-const char * ggml_get_name(const struct ggml_tensor * tensor) {
-    return tensor->name;
-}
-
-void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
-    strncpy(tensor->name, name, sizeof(tensor->name));
-    tensor->name[sizeof(tensor->name) - 1] = '\0';
-}
-
-struct ggml_tensor * ggml_view_tensor(
-        struct ggml_context * ctx,
-        const struct ggml_tensor * src) {
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);
-
-    result->nb[0] = src->nb[0];
-    result->nb[1] = src->nb[1];
-    result->nb[2] = src->nb[2];
-    result->nb[3] = src->nb[3];
-
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// ggml_dup
-
-struct ggml_tensor * ggml_dup_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_DUP;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_dup(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_dup_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_dup_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_dup_impl(ctx, a, true);
-}
-
-// ggml_add
-
-struct ggml_tensor * ggml_add_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_ADD;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_add(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_add_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_add_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_add_impl(ctx, a, b, true);
-}
-
-// ggml_sub
-
-struct ggml_tensor * ggml_sub_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SUB;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_sub(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_sub_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_sub_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_sub_impl(ctx, a, b, true);
-}
-
-// ggml_mul
-
-struct ggml_tensor * ggml_mul_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    if (inplace) {
-        GGML_ASSERT(is_node == false);
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_MUL;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_mul(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_mul_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_mul_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_mul_impl(ctx, a, b, true);
-}
-
-// ggml_div
-
-struct ggml_tensor * ggml_div_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    if (inplace) {
-        GGML_ASSERT(is_node == false);
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_DIV;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_div(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_div_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_div_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    return ggml_div_impl(ctx, a, b, true);
-}
-
-// ggml_sqr
-
-struct ggml_tensor * ggml_sqr_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SQR;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_sqr(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqr_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_sqr_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqr_impl(ctx, a, true);
-}
-
-// ggml_sqrt
-
-struct ggml_tensor * ggml_sqrt_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SQRT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_sqrt(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqrt_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_sqrt_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sqrt_impl(ctx, a, true);
-}
-
-// ggml_sum
-
-struct ggml_tensor * ggml_sum(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);
-
-    result->op   = GGML_OP_SUM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-// ggml_mean
-
-struct ggml_tensor * ggml_mean(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement
-        is_node = true;
-    }
-
-    int64_t ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);
-
-    result->op   = GGML_OP_MEAN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-// ggml_repeat
-
-struct ggml_tensor * ggml_repeat(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    GGML_ASSERT(ggml_can_repeat(a, b));
-
-    bool is_node = false;
-
-    if (a->grad) {
-        is_node = true;
-    }
-
-    if (ggml_are_same_shape(a, b) && !is_node) {
-        return a;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
-
-    result->op   = GGML_OP_REPEAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-// ggml_abs
-
-struct ggml_tensor * ggml_abs_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_ABS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_abs(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_abs_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_abs_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_abs_impl(ctx, a, true);
-}
-
-
-// ggml_sgn
-
-struct ggml_tensor * ggml_sgn_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SGN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_sgn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sgn_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_sgn_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_sgn_impl(ctx, a, true);
-}
-
-// ggml_neg
-
-struct ggml_tensor * ggml_neg_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_NEG;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_neg(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_neg_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_neg_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_neg_impl(ctx, a, true);
-}
-
-// ggml_step
-
-struct ggml_tensor * ggml_step_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_STEP;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_step(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_step_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_step_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_step_impl(ctx, a, true);
-}
-
-// ggml_relu
-
-struct ggml_tensor * ggml_relu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_RELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_relu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_relu_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_relu_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_relu_impl(ctx, a, true);
-}
-
-// ggml_gelu
-
-struct ggml_tensor * ggml_gelu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_GELU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_gelu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_gelu_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_gelu_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_gelu_impl(ctx, a, true);
-}
-
-// ggml_silu
-
-struct ggml_tensor * ggml_silu_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_SILU;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_silu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_silu_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_silu_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_silu_impl(ctx, a, true);
-}
-
-// ggml_norm
-
-struct ggml_tensor * ggml_norm_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store epsilon here?
-
-    return result;
-}
-
-struct ggml_tensor * ggml_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_norm_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_norm_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_norm_impl(ctx, a, true);
-}
-
-struct ggml_tensor * ggml_rms_norm_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && (a->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_RMS_NORM;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store epsilon here?
-
-    return result;
-}
-
-struct ggml_tensor * ggml_rms_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_rms_norm_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_rms_norm_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_rms_norm_impl(ctx, a, true);
-}
-
-// ggml_mul_mat
-
-struct ggml_tensor * ggml_mul_mat(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_can_mul_mat(a, b));
-    GGML_ASSERT(!ggml_is_transposed(a));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);
-
-    result->op   = GGML_OP_MUL_MAT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-// ggml_scale
-
-struct ggml_tensor * ggml_scale_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_is_scalar(b));
-    GGML_ASSERT(ggml_is_padded_1d(a));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    result->op   = GGML_OP_SCALE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_scale(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_scale_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_scale_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_scale_impl(ctx, a, b, true);
-}
-
-// ggml_cpy
-
-struct ggml_tensor * ggml_cpy_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        bool inplace) {
-    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // make a view of the destination
-    struct ggml_tensor * result = ggml_view_tensor(ctx, b);
-
-    result->op   = GGML_OP_CPY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_cpy(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, false);
-}
-
-struct ggml_tensor * ggml_cpy_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    return ggml_cpy_impl(ctx, a, b, true);
-}
-
-// ggml_cont
-
-struct ggml_tensor * ggml_cont_impl(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        bool inplace) {
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op   = GGML_OP_CONT;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_cont(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a, false);
-}
-
-struct ggml_tensor * ggml_cont_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a) {
-    return ggml_cont_impl(ctx, a, true);
-}
-
-// ggml_reshape
-
-struct ggml_tensor * ggml_reshape(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a,
-        struct ggml_tensor * b) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_is_contiguous(b));
-    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_reshape_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_nelements(a) == ne0*ne1);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[2] = { ne0, ne1 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_reshape_3d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2) {
-    GGML_ASSERT(ggml_is_contiguous(a));
-    GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[3] = { ne0, ne1, ne2 };
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);
-
-    result->op   = GGML_OP_RESHAPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-// ggml_view_1d
-
-struct ggml_tensor * ggml_view_1d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        size_t                offset) {
-    if (a->grad) {
-        GGML_ASSERT(false); // gradient propagation is not supported
-    }
-
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
-
-    result->op   = GGML_OP_VIEW;
-    result->grad = NULL;
-    result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store the offset here?
-
-    return result;
-}
-
-// ggml_view_2d
-
-struct ggml_tensor * ggml_view_2d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        size_t                nb1,
-        size_t                offset) {
-    if (a->grad) {
-        GGML_ASSERT(false); // gradient propagation is not supported
-    }
-
-    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
-
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
-
-    result->nb[1] = nb1;
-    result->nb[2] = result->nb[1]*ne1;
-    result->nb[3] = result->nb[2];
-
-    result->op   = GGML_OP_VIEW;
-    result->grad = NULL;
-    result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store the offset here?
-
-    return result;
-}
-
-// ggml_view_3d
-
-struct ggml_tensor * ggml_view_3d(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        size_t                nb1,
-        size_t                nb2,
-        size_t                offset) {
-    if (a->grad) {
-        GGML_ASSERT(false); // gradient propagation is not supported
-    }
-
-    const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
-
-    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
-
-    result->nb[1] = nb1;
-    result->nb[2] = nb2;
-    result->nb[3] = result->nb[2]*ne2;
-
-    result->op   = GGML_OP_VIEW;
-    result->grad = NULL;
-    result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store the offset here?
-
-    return result;
-}
-
-// ggml_permute
-
-struct ggml_tensor * ggml_permute(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   axis0,
-        int                   axis1,
-        int                   axis2,
-        int                   axis3) {
-    GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS);
-    GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS);
-    GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS);
-    GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS);
-
-    GGML_ASSERT(axis0 != axis1);
-    GGML_ASSERT(axis0 != axis2);
-    GGML_ASSERT(axis0 != axis3);
-    GGML_ASSERT(axis1 != axis2);
-    GGML_ASSERT(axis1 != axis3);
-    GGML_ASSERT(axis2 != axis3);
-
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    int ne[GGML_MAX_DIMS];
-    int nb[GGML_MAX_DIMS];
-
-    ne[axis0] = a->ne[0];
-    ne[axis1] = a->ne[1];
-    ne[axis2] = a->ne[2];
-    ne[axis3] = a->ne[3];
-
-    nb[axis0] = a->nb[0];
-    nb[axis1] = a->nb[1];
-    nb[axis2] = a->nb[2];
-    nb[axis3] = a->nb[3];
-
-    result->ne[0] = ne[0];
-    result->ne[1] = ne[1];
-    result->ne[2] = ne[2];
-    result->ne[3] = ne[3];
-
-    result->nb[0] = nb[0];
-    result->nb[1] = nb[1];
-    result->nb[2] = nb[2];
-    result->nb[3] = nb[3];
-
-    result->op   = GGML_OP_PERMUTE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL; // TODO: maybe store the permutation here?
-
-    return result;
-}
-
-// ggml_transpose
-
-struct ggml_tensor * ggml_transpose(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    result->ne[0] = a->ne[1];
-    result->ne[1] = a->ne[0];
-
-    result->nb[0] = a->nb[1];
-    result->nb[1] = a->nb[0];
-
-    result->op   = GGML_OP_TRANSPOSE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-// ggml_get_rows
-
-struct ggml_tensor * ggml_get_rows(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);
-
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: implement non F32 return
-    //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]);
-
-    result->op   = GGML_OP_GET_ROWS;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-// ggml_diag_mask_inf
-
-struct ggml_tensor * ggml_diag_mask_inf(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-    struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
-    ggml_set_name(b, "n_past");
-
-    result->op   = GGML_OP_DIAG_MASK_INF;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-// ggml_soft_max
-
-struct ggml_tensor * ggml_soft_max(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    result->op   = GGML_OP_SOFT_MAX;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = NULL;
-
-    return result;
-}
-
-// ggml_rope
-
-struct ggml_tensor * ggml_rope(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        int                   n_dims,
-        int                   mode) {
-    GGML_ASSERT(n_past >= 0);
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);
-    ((int32_t *) b->data)[0] = n_past;
-    ((int32_t *) b->data)[1] = n_dims;
-    ((int32_t *) b->data)[2] = mode;
-    ggml_set_name(b, "n_past, n_dims, mode");
-
-    result->op   = GGML_OP_ROPE;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-// ggml_alibi
-
-struct ggml_tensor * ggml_alibi(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        int                   n_past,
-        int                   n_head) {
-    GGML_ASSERT(n_past >= 0);
-    bool is_node = false;
-
-    if (a->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    // TODO: when implement backward, fix this:
-    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
-    ((int32_t *) b->data)[0] = n_past;
-    ((int32_t *) b->data)[1] = n_head;
-
-    result->op   = GGML_OP_ALIBI;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-// ggml_conv_1d_1s
-
-struct ggml_tensor * ggml_conv_1d_1s(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_is_matrix(b));
-    GGML_ASSERT(a->ne[1] == b->ne[1]);
-    GGML_ASSERT(a->ne[3] == 1);
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
-
-    result->op   = GGML_OP_CONV_1D_1S;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-// ggml_conv_1d_2s
-
-struct ggml_tensor * ggml_conv_1d_2s(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b) {
-    GGML_ASSERT(ggml_is_matrix(b));
-    GGML_ASSERT(a->ne[1] == b->ne[1]);
-    GGML_ASSERT(a->ne[3] == 1);
-    bool is_node = false;
-
-    if (a->grad || b->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);
-
-    result->op   = GGML_OP_CONV_1D_2S;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-
-    return result;
-}
-
-// ggml_flash_attn
-
-struct ggml_tensor * ggml_flash_attn(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * q,
-        struct ggml_tensor  * k,
-        struct ggml_tensor  * v,
-        bool                  masked) {
-    GGML_ASSERT(ggml_can_mul_mat(k, q));
-    // TODO: check if vT can be multiplied by (k*qT)
-
-    bool is_node = false;
-
-    if (q->grad || k->grad || v->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne);
-
-    result->op   = GGML_OP_FLASH_ATTN;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = q;
-    result->src1 = k;
-    result->opt[0] = v;
-    result->opt[1] = ggml_new_i32(ctx, masked ? 1 : 0);
-
-    return result;
-}
-
-// ggml_flash_ff
-
-struct ggml_tensor * ggml_flash_ff(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b0,
-        struct ggml_tensor  * b1,
-        struct ggml_tensor  * c0,
-        struct ggml_tensor  * c1) {
-    GGML_ASSERT(ggml_can_mul_mat(b0, a));
-    // TODO: more checks
-
-    bool is_node = false;
-
-    if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) {
-        GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-
-    //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);
-
-    result->op   = GGML_OP_FLASH_FF;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b0;
-    result->opt[0] = b1;
-    result->opt[1] = c0;
-    result->opt[2] = c1;
-
-    return result;
-}
-
-// ggml_map_unary
-
-struct ggml_tensor * ggml_map_unary_impl_f32(
-        struct ggml_context        * ctx,
-        struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun,
-        bool   inplace) {
-    bool is_node = false;
-
-    if (!inplace && a->grad) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
-    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_MAP_UNARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->opt[0] = addr_tensor;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_unary_f32(
-        struct ggml_context        * ctx,
-        struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun) {
-    return ggml_map_unary_impl_f32(ctx, a, fun, false);
-}
-
-struct ggml_tensor * ggml_map_unary_inplace_f32(
-        struct ggml_context        * ctx,
-        struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun) {
-    return ggml_map_unary_impl_f32(ctx, a, fun, true);
-}
-
-// ggml_map_binary
-
-struct ggml_tensor * ggml_map_binary_impl_f32(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun,
-        bool   inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
-
-    bool is_node = false;
-
-    if (!inplace && (a->grad || b->grad)) {
-        is_node = true;
-    }
-
-    struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t));
-    *((void (**)(void))addr_tensor->data) = (void (*)(void))fun;
-    struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
-
-    result->op = GGML_OP_MAP_BINARY;
-    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
-    result->src0 = a;
-    result->src1 = b;
-    result->opt[0] = addr_tensor;
-
-    return result;
-}
-
-struct ggml_tensor * ggml_map_binary_f32(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun) {
-    return ggml_map_binary_impl_f32(ctx, a, b, fun, false);
-}
-
-struct ggml_tensor * ggml_map_binary_inplace_f32(
-        struct ggml_context         * ctx,
-        struct ggml_tensor          * a,
-        struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun) {
-    return ggml_map_binary_impl_f32(ctx, a, b, fun, true);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-void ggml_set_param(
-        struct ggml_context * ctx,
-        struct ggml_tensor * tensor) {
-    tensor->is_param = true;
-
-    GGML_ASSERT(tensor->grad == NULL);
-    tensor->grad = ggml_dup_tensor(ctx, tensor);
-}
-
-// ggml_compute_forward_dup
-
-static void ggml_compute_forward_dup_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        // parallelize by elements
-        const int ne = ggml_nelements(dst);
-        const int dr = (ne + nth - 1) / nth;
-        const int ie0 = dr * ith;
-        const int ie1 = MIN(ie0 + dr, ne);
-
-        memcpy(
-            ((char *)  dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
-
-        return;
-    }
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
-        // copy by rows
-        const size_t rs = ne00*nb00;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy
-
-    if (ggml_is_contiguous(dst)) {
-        if (nb00 == sizeof(ggml_fp16_t)) {
-            if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                dst_ptr[id] = GGML_FP16_TO_FP32(src0_ptr[i00]);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (ggml_is_quantized(dst->type)) {
-                quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
-                float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
-
-                size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                src0_f32[i00] = GGML_FP16_TO_FP32(src0_ptr[i00]);
-                            }
-
-                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = *src0_ptr;
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        }
-        return;
-    }
-
-    // dst counters
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    if (dst->type == GGML_TYPE_F16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        memcpy(dst_ptr, src0_ptr, sizeof(ggml_fp16_t));
-
-                        if (++i10 == ne00) {
-                            i10 = 0;
-                            if (++i11 == ne01) {
-                                i11 = 0;
-                                if (++i12 == ne02) {
-                                    i12 = 0;
-                                    if (++i13 == ne03) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_F32) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(float *) dst_ptr = GGML_FP16_TO_FP32(*(const ggml_fp16_t *) src0_ptr);
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
-    }
-}
-
-static void ggml_compute_forward_dup_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
-    const int ith = params->ith; // thread index
-    const int nth = params->nth; // number of threads
-
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        // parallelize by elements
-        const int ne = ggml_nelements(dst);
-        const int dr = (ne + nth - 1) / nth;
-        const int ie0 = dr * ith;
-        const int ie1 = MIN(ie0 + dr, ne);
-
-        memcpy(
-            ((char *)  dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * GGML_TYPE_SIZE[src0->type]);
-
-        return;
-    }
-
-    // parallelize by rows
-    const int nr = ne01;
-    // number of rows per thread
-    const int dr = (nr + nth - 1) / nth;
-    // row range for this thread
-    const int ir0 = dr * ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    if (src0->type == dst->type &&
-        ne00 == ne0 &&
-        nb00 == GGML_TYPE_SIZE[src0->type] && nb0 == GGML_TYPE_SIZE[dst->type]) {
-        // copy by rows
-        const size_t rs = ne00*nb00;
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    memcpy(
-                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
-                        rs);
-                }
-            }
-        }
-        return;
-    }
-
-    if (ggml_is_contiguous(dst)) {
-        // TODO: simplify
-        if (nb00 == sizeof(float)) {
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                const size_t rs = ne00 * nb00;
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
-                            memcpy(dst_ptr + id, src0_ptr, rs);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (ggml_is_quantized(dst->type)) {
-                quantize_row_q_t const quantize_row_q = quantize_fns[dst->type].quantize_row_q;
-
-                size_t id = 0;
-                size_t rs = nb0 * (ne00 / GGML_BLCK_SIZE[dst->type]);
-                char * dst_ptr = (char *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += rs * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
-                            id += rs;
-                        }
-                        id += rs * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
-
-            if (dst->type == GGML_TYPE_F32) {
-                size_t id = 0;
-                float * dst_ptr = (float *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = *src0_ptr;
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else if (dst->type == GGML_TYPE_F16) {
-                size_t id = 0;
-                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
-
-                for (int i03 = 0; i03 < ne03; i03++) {
-                    for (int i02 = 0; i02 < ne02; i02++) {
-                        id += ne00 * ir0;
-                        for (int i01 = ir0; i01 < ir1; i01++) {
-                            for (int i00 = 0; i00 < ne00; i00++) {
-                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-
-                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
-                                id++;
-                            }
-                        }
-                        id += ne00 * (ne01 - ir1);
-                    }
-                }
-            } else {
-                GGML_ASSERT(false); // TODO: implement
-            }
-        }
-
-        return;
-    }
-
-    // dst counters
-
-    int64_t i10 = 0;
-    int64_t i11 = 0;
-    int64_t i12 = 0;
-    int64_t i13 = 0;
-
-    if (dst->type == GGML_TYPE_F32) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        memcpy(dst_ptr, src0_ptr, sizeof(float));
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else if (dst->type == GGML_TYPE_F16) {
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                i10 += ne00 * ir0;
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-                for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              char * dst_ptr  = ((char *)  dst->data + i10*nb0  + i11*nb1  + i12*nb2  + i13*nb3);
-
-                        *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(*(const float *) src0_ptr);
-
-                        if (++i10 == ne0) {
-                            i10 = 0;
-                            if (++i11 == ne1) {
-                                i11 = 0;
-                                if (++i12 == ne2) {
-                                    i12 = 0;
-                                    if (++i13 == ne3) {
-                                        i13 = 0;
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-                i10 += ne00 * (ne01 - ir1);
-                while (i10 >= ne0) {
-                    i10 -= ne0;
-                    if (++i11 == ne1) {
-                        i11 = 0;
-                        if (++i12 == ne2) {
-                            i12 = 0;
-                            if (++i13 == ne3) {
-                                i13 = 0;
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ASSERT(false); // TODO: implement
-    }
-}
-
-static void ggml_compute_forward_dup(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_dup_f16(params, src0, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_dup_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_add
-
-static void ggml_compute_forward_add_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb10 = src1->nb[0];
-    const size_t nb11 = src1->nb[1];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-
-    GGML_ASSERT( nb0 == sizeof(float));
-    GGML_ASSERT(nb00 == sizeof(float));
-
-    if (nb10 == sizeof(float)) {
-        for (int j = ith; j < n; j += nth) {
-#ifdef GGML_USE_ACCELERATE
-            vDSP_vadd(
-                    (float *) ((char *) src0->data + j*nb01), 1,
-                    (float *) ((char *) src1->data + j*nb11), 1,
-                    (float *) ((char *) dst->data  + j*nb1),  1, nc);
-#else
-            ggml_vec_add_f32(nc,
-                    (float *) ((char *) dst->data  + j*nb1),
-                    (float *) ((char *) src0->data + j*nb01),
-                    (float *) ((char *) src1->data + j*nb11));
-#endif
-        }
-    } else {
-        // src1 is not contiguous
-        for (int j = ith; j < n; j += nth) {
-            float * dst_ptr  = (float *) ((char *) dst->data  + j*nb1);
-            float * src0_ptr = (float *) ((char *) src0->data + j*nb01);
-            for (int i = 0; i < nc; i++) {
-                float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
-
-                dst_ptr[i] = src0_ptr[i] + *src1_ptr;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_add_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb10 = src1->nb[0];
-    const size_t nb11 = src1->nb[1];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    if (nb10 == sizeof(float)) {
-        for (int j = ith; j < n; j += nth) {
-            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + j*nb1);
-            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
-            for (int i = 0; i < nc; i++) {
-                float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);
-                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + *src1_ptr);
-            }
-        }
-    }
-    else {
-        // src1 is not contiguous
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_compute_forward_add_f16_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    const size_t nb00 = src0->nb[0];
-    const size_t nb01 = src0->nb[1];
-
-    const size_t nb10 = src1->nb[0];
-    const size_t nb11 = src1->nb[1];
-
-    const size_t nb0 = dst->nb[0];
-    const size_t nb1 = dst->nb[1];
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F16);
-    GGML_ASSERT(dst->type == GGML_TYPE_F16);
-
-    GGML_ASSERT( nb0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    if (nb10 == sizeof(ggml_fp16_t)) {
-        for (int j = ith; j < n; j += nth) {
-            ggml_fp16_t * dst_ptr  = (ggml_fp16_t *) ((char *) dst->data  + j*nb1);
-            ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + j*nb01);
-            for (int i = 0; i < nc; i++) {
-                ggml_fp16_t * src1_ptr = (ggml_fp16_t *) ((char *) src1->data + j*nb11 + i*nb10);
-                dst_ptr[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(src0_ptr[i]) + GGML_FP16_TO_FP32(*src1_ptr));
-            }
-        }
-    }
-    else {
-        // src1 is not contiguous
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_compute_forward_add_q_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    //const int64_t ne10 = src1->ne[0];
-    //const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    //const int64_t ne0  = dst->ne[0];
-    //const int64_t ne1  = dst->ne[1];
-    const int64_t ne2  = dst->ne[2];
-    const int64_t ne3  = dst->ne[3];
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb0  = dst->nb[0];
-    const int nb1  = dst->nb[1];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2  == ne12);
-    GGML_ASSERT(ne3  == ne13);
-
-    const enum ggml_type type = src0->type;
-    dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
-    quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q;
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ggml_is_quantized(src0->type));
-    GGML_ASSERT(dst->type == src0->type);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    // total rows in src0
-    const int nr = ne01*ne02*ne03;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    float * wdata = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
-
-    assert(ne01 >= 1);
-    assert(ne02*ne01 >= 1);
-    struct magicu ne01m = __magicu_get(ne01);
-    struct magicu ne021m = __magicu_get(ne02*ne01);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = __magicu_div(ir, ne021m);
-        const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        // src1 and dst are same shape as src0 => same indices
-        const int i13 = i03;
-        const int i12 = i02;
-        const int i11 = i01;
-
-        const int i3 = i03;
-        const int i2 = i02;
-        const int i1 = i01;
-
-        void  * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-        float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13));
-        void  * dst_row  = (void *) ((char *)  dst->data + ( i1*nb1  +  i2*nb2  +  i3*nb0));
-
-        assert(ne00 % 32 == 0);
-
-        // unquantize row from src0 to temp buffer
-        dequantize_row_q(src0_row, wdata, ne00);
-        // add src1
-        ggml_vec_acc_f32(ne00, wdata, src1_row);
-        // quantize row to dst
-        quantize_row_q(wdata, dst_row, ne00);
-    }
-}
-
-static void ggml_compute_forward_add(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_add_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                if (src1->type == GGML_TYPE_F16) {
-                    ggml_compute_forward_add_f16_f16(params, src0, src1, dst);
-                }
-                else if (src1->type == GGML_TYPE_F32) {
-                    ggml_compute_forward_add_f16_f32(params, src0, src1, dst);
-                }
-                else {
-                    GGML_ASSERT(false);
-                }
-            } break;
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            {
-                ggml_compute_forward_add_q_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sub
-
-static void ggml_compute_forward_sub_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sub_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])),
-                (float *) ((char *) src1->data + i*(src1->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sub(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sub_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_mul
-
-static void ggml_compute_forward_mul_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_mul_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])),
-                (float *) ((char *) src1->data + i*(src1->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_mul(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mul_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_div
-
-static void ggml_compute_forward_div_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_div_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])),
-                (float *) ((char *) src1->data + i*(src1->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_div(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_div_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sqr
-
-static void ggml_compute_forward_sqr_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n     = ggml_nrows(src0);
-    const int nc    = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sqr_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sqr(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sqr_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sqrt
-
-static void ggml_compute_forward_sqrt_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sqrt_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sqrt(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sqrt_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sum
-
-static void ggml_compute_forward_sum_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_is_scalar(dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    assert(ggml_is_scalar(dst));
-    assert(src0->nb[0] == sizeof(float));
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    ggml_float sum     = 0;
-    ggml_float row_sum = 0;
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_ggf(ne00,
-                        &row_sum,
-                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
-                sum += row_sum;
-            }
-        }
-    }
-    ((float *) dst->data)[0] = sum;
-}
-
-static void ggml_compute_forward_sum(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sum_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_mean
-
-static void ggml_compute_forward_mean_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    assert(src0->nb[0] == sizeof(float));
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t ne3 = dst->ne[3];
-
-    assert(ne0 == 1);
-    assert(ne1 == ne01);
-    assert(ne2 == ne02);
-    assert(ne3 == ne03);
-
-    UNUSED(ne0);
-    UNUSED(ne1);
-    UNUSED(ne2);
-    UNUSED(ne3);
-
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = 0; i01 < ne01; i01++) {
-                ggml_vec_sum_f32(ne00,
-                        (float *) ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
-                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
-
-                *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_mean(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mean_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_repeat
-
-static void ggml_compute_forward_repeat_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_can_repeat(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // TODO: implement support for rank > 2 tensors
-    assert(src0->ne[2] == 1);
-    assert(src0->ne[3] == 1);
-    assert( dst->ne[2] == 1);
-    assert( dst->ne[3] == 1);
-
-    const int nc  = dst->ne[0];
-    const int nr  = dst->ne[1];
-    const int nc0 = src0->ne[0];
-    const int nr0 = src0->ne[1];
-    const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat
-    const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat
-
-    // TODO: support for transposed / permuted tensors
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    // TODO: maybe this is not optimal?
-    for (int i = 0; i < nrr; i++) {
-        for (int j = 0; j < ncr; j++) {
-            for (int k = 0; k < nr0; k++) {
-                ggml_vec_cpy_f32(nc0,
-                        (float *) ((char *)  dst->data + (i*nr0 + k)*( dst->nb[1]) + j*nc0*( dst->nb[0])),
-                        (float *) ((char *) src0->data + (        k)*(src0->nb[1])));
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_repeat(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_repeat_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_abs
-
-static void ggml_compute_forward_abs_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_abs_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_abs(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_abs_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_sgn
-
-static void ggml_compute_forward_sgn_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_sgn_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_sgn(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_sgn_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_neg
-
-static void ggml_compute_forward_neg_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_neg_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_neg(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_neg_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_step
-
-static void ggml_compute_forward_step_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_step_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_step(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_step_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_relu
-
-static void ggml_compute_forward_relu_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_relu_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_relu(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_relu_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_gelu
-
-static void ggml_compute_forward_gelu_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_gelu_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_gelu(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_gelu_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    //printf("XXXXXXXX gelu\n");
-}
-
-// ggml_compute_forward_silu
-
-static void ggml_compute_forward_silu_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_silu_f32(nc,
-                (float *) ((char *) dst->data  + i1*( dst->nb[1])),
-                (float *) ((char *) src0->data + i1*(src0->nb[1])));
-
-#ifndef NDEBUG
-        for (int k = 0; k < nc; k++) {
-            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
-            UNUSED(x);
-            assert(!isnan(x));
-            assert(!isinf(x));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_silu(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_silu_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-
-// ggml_compute_forward_norm
-
-static void ggml_compute_forward_norm_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
-    const float eps = 1e-5f; // TODO: make this a parameter
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)x[i00];
-                }
-
-                float mean = sum/ne00;
-
-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                ggml_float sum2 = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    float v = x[i00] - mean;
-                    y[i00] = v;
-                    sum2 += (ggml_float)(v*v);
-                }
-
-                float variance = sum2/ne00;
-                const float scale = 1.0f/sqrtf(variance + eps);
-
-                ggml_vec_scale_f32(ne00, y, scale);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_norm(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_norm_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_rms_norm_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    GGML_ASSERT(src0->nb[0] == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-    const size_t nb03 = src0->nb[3];
-
-    const size_t nb1 = dst->nb[1];
-    const size_t nb2 = dst->nb[2];
-    const size_t nb3 = dst->nb[3];
-
-    const float eps = 1e-6f; // TODO: make this a parameter
-
-    // TODO: optimize
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-
-                ggml_float sum = 0.0;
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    sum += (ggml_float)(x[i00] * x[i00]);
-                }
-
-                float mean = sum/ne00;
-
-                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
-
-                memcpy(y, x, ne00 * sizeof(float));
-                // for (int i00 = 0; i00 < ne00; i00++) {
-                //     y[i00] = x[i00];
-                // }
-
-                const float scale = 1.0f/sqrtf(mean + eps);
-
-                ggml_vec_scale_f32(ne00, y, scale);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rms_norm(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rms_norm_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-
-// ggml_compute_forward_mul_mat
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    //const int64_t ne00 = src0->ne[0];
-    //const int64_t ne01 = src0->ne[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-#endif
-
-static void ggml_compute_forward_mul_mat_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-    const int64_t ne10 = src1->ne[0];
-#endif
-    const int64_t ne11 = src1->ne[1];
-#ifndef NDEBUG
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t ne0  = dst->ne[0];
-    const int64_t ne1  = dst->ne[1];
-    const int64_t ne2  = dst->ne[2];
-    const int64_t ne3  = dst->ne[3];
-
-    const int nb00 = src0->nb[0];
-#endif
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-
-#ifndef NDEBUG
-    const int nb10 = src1->nb[0];
-#endif
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb0  = dst->nb[0];
-    const int nb1  = dst->nb[1];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    assert(ne02 == ne12);
-    assert(ne03 == ne13);
-    assert(ne2  == ne12);
-    assert(ne3  == ne13);
-
-    // we don't support permuted src0 or src1
-    assert(nb00 == sizeof(float));
-    assert(nb10 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    assert(nb0 == sizeof(float));
-    assert(nb0 <= nb1);
-    assert(nb1 <= nb2);
-    assert(nb2 <= nb3);
-
-    assert(ne0 == ne01);
-    assert(ne1 == ne11);
-    assert(ne2 == ne02);
-    assert(ne3 == ne03);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#endif
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        if (params->ith != 0) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_INIT) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_FINALIZE) {
-            return;
-        }
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-#if defined(GGML_USE_CLBLAST)
-                // zT = y * xT
-                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne10,
-                        0.0f,    d, ne01,
-                        GGML_TYPE_F32);
-#else
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne00,
-                        0.0f,    d, ne01);
-#endif
-            }
-        }
-        //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by src0 rows using ggml_vec_dot_f32
-
-    // total rows in src0
-    const int nr = ne01*ne02*ne03;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    assert(ne01 >= 1);
-    assert(ne02*ne01 >= 1);
-    struct magicu ne01m = __magicu_get(ne01);
-    struct magicu ne021m = __magicu_get(ne02*ne01);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = __magicu_div(ir, ne021m);
-        const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        for (int64_t ic = 0; ic < ne11; ++ic) {
-            // src1 indices
-            const int i13 = i03;
-            const int i12 = i02;
-            const int i11 = ic;
-
-            // dst indices
-            const int i0 = i01;
-            const int i1 = i11;
-            const int i2 = i02;
-            const int i3 = i03;
-
-            ggml_vec_dot_f32(ne00,
-                    (float *) ((char *)  dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)),
-                    (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
-        }
-    }
-
-    //int64_t t1 = ggml_perf_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-    //    printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13);
-
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
-}
-
-static void ggml_compute_forward_mul_mat_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t ne0  = dst->ne[0];
-    const int64_t ne1  = dst->ne[1];
-    const int64_t ne2  = dst->ne[2];
-    const int64_t ne3  = dst->ne[3];
-    //const int64_t ne   = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb0  = dst->nb[0];
-    const int nb1  = dst->nb[1];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2  == ne12);
-    GGML_ASSERT(ne3  == ne13);
-
-    // TODO: we don't support permuted src0
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#endif
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        GGML_ASSERT(nb10 == sizeof(float));
-
-        if (params->ith != 0) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_INIT) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_FINALIZE) {
-            return;
-        }
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                float * const wdata = params->wdata;
-                {
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        for (int64_t i00 = 0; i00 < ne00; ++i00) {
-                            wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
-                        }
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                }
-
-#if defined(GGML_USE_CLBLAST)
-                const float * x = wdata;
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-                // zT = y * xT
-                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne10,
-                        0.0f,    d, ne01,
-                        GGML_TYPE_F32);
-#else
-                const float * x = wdata;
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-                // zT = y * xT
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne00,
-                        0.0f,    d, ne01);
-#endif
-            }
-        }
-
-        /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
-
-        return;
-    }
-#endif
-
-    if (params->type == GGML_TASK_INIT) {
-        ggml_fp16_t * const wdata = params->wdata;
-
-        size_t id = 0;
-        for (int64_t i13 = 0; i13 < ne13; ++i13) {
-            for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                    for (int64_t i10 = 0; i10 < ne10; ++i10) {
-                        wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
-                    }
-                }
-            }
-        }
-
-        GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize);
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // fp16 -> half the size, so divide by 2
-    // TODO: do not support transposed src1
-    assert(nb10/2 == sizeof(ggml_fp16_t));
-
-    // parallelize by src0 rows using ggml_vec_dot_f16
-
-    // total rows in src0
-    const int nr = ne01*ne02*ne03;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    ggml_fp16_t * wdata = params->wdata;
-
-    assert(ne01 >= 1);
-    assert(ne02*ne01 >= 1);
-    struct magicu ne01m = __magicu_get(ne01);
-    struct magicu ne021m = __magicu_get(ne02*ne01);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = __magicu_div(ir, ne021m);
-        const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        const int i13 = i03;
-        const int i12 = i02;
-
-        const int i0 = i01;
-        const int i2 = i02;
-        const int i3 = i03;
-
-        ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-        ggml_fp16_t * src1_col =                                wdata + (       0 + i12*ne11 + i13*ne12*ne11)*ne00;
-
-        float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
-
-        for (int64_t ic = 0; ic < ne11; ++ic) {
-            ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
-        }
-    }
-
-    //int64_t t1 = ggml_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
-}
-
-static void ggml_compute_forward_mul_mat_q_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t ne0  = dst->ne[0];
-    const int64_t ne1  = dst->ne[1];
-    const int64_t ne2  = dst->ne[2];
-    const int64_t ne3  = dst->ne[3];
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    const int nb12 = src1->nb[2];
-    const int nb13 = src1->nb[3];
-
-    const int nb0  = dst->nb[0];
-    const int nb1  = dst->nb[1];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2  == ne12);
-    GGML_ASSERT(ne3  == ne13);
-
-    const enum ggml_type type = src0->type;
-    quantize_row_q_t const quantize_row_q_dot = quantize_fns[type].quantize_row_q_dot;
-    vec_dot_q_t      const vec_dot_q          = quantize_fns[type].vec_dot_q;
-    enum ggml_type   const vec_dot_type       = quantize_fns[type].vec_dot_type;
-
-    // we don't support permuted src0 or src1
-    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[type]);
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    GGML_ASSERT(ne0 == ne01);
-    GGML_ASSERT(ne1 == ne11);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
-    // nb01 >= nb00 - src0 is not transposed
-    //   compute by src0 rows
-
-#if defined(GGML_USE_CUBLAS)
-    if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
-        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
-            ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
-        }
-        return;
-    }
-#endif
-
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
-        if (params->ith != 0) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_INIT) {
-            return;
-        }
-
-        if (params->type == GGML_TASK_FINALIZE) {
-            return;
-        }
-
-        float * const wdata = params->wdata;
-        dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
-
-        for (int64_t i03 = 0; i03 < ne03; i03++) {
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-#if defined(GGML_USE_CLBLAST)
-                const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
-#else
-                {
-                    size_t id = 0;
-                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
-                        dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
-                        id += ne00;
-                    }
-
-                    assert(id*sizeof(float) <= params->wsize);
-                }
-
-                const float * x = wdata;
-#endif
-
-#if defined(GGML_USE_CLBLAST)
-                // zT = y * xT
-                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne10,
-                        0.0f,    d, ne01,
-                        type);
-#else
-                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne00,
-                        0.0f,    d, ne01);
-#endif
-            }
-        }
-
-        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
-        return;
-    }
-#endif
-
-    if (params->type == GGML_TASK_INIT) {
-        char * wdata = params->wdata;
-        const size_t row_size = ne10*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
-
-        for (int64_t i13 = 0; i13 < ne13; ++i13) {
-            for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                for (int64_t i11 = 0; i11 < ne11; ++i11) {
-                    quantize_row_q_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
-                    wdata += row_size;
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by src0 rows using ggml_vec_dot_q
-
-    // total rows in src0
-    const int nr = ne01*ne02*ne03;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    void * wdata = params->wdata;
-    const size_t row_size = ne00*GGML_TYPE_SIZE[vec_dot_type]/GGML_BLCK_SIZE[vec_dot_type];
-
-    assert(ne01 >= 1);
-    assert(ne02*ne01 >= 1);
-    struct magicu ne01m = __magicu_get(ne01);
-    struct magicu ne021m = __magicu_get(ne02*ne01);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // src0 indices
-        const int i03 = __magicu_div(ir, ne021m);
-        const int i02 = __magicu_div(ir - i03*ne02*ne01, ne01m);
-        const int i01 = (ir - i03*ne02*ne01 - i02*ne01);
-
-        const int i13 = i03;
-        const int i12 = i02;
-
-        const int i0 = i01;
-        const int i2 = i02;
-        const int i3 = i03;
-
-        void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));
-        char * src1_col =          ((char *)      wdata + (      (0 + i12*ne11 + i13*ne12*ne11)*row_size));
-
-        float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));
-
-        assert(ne00 % 32 == 0);
-
-        for (int64_t ic = 0; ic < ne11; ++ic) {
-            vec_dot_q(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size));
-        }
-    }
-
-    //int64_t t1 = ggml_time_us();
-    //static int64_t acc = 0;
-    //acc += t1 - t0;
-    //if (t1 - t0 > 10) {
-    //    printf("\n");
-    //    printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03);
-    //    printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03);
-    //    printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13);
-
-    //    printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc);
-    //}
-}
-
-static void ggml_compute_forward_mul_mat(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-            {
-                ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_mul_mat_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_scale
-
-static void ggml_compute_forward_scale_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-    GGML_ASSERT(ggml_is_scalar(src1));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // scale factor
-    const float v = *(float *) src1->data;
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), v);
-    }
-}
-
-static void ggml_compute_forward_scale(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_scale_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_cpy
-
-static void ggml_compute_forward_cpy(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    ggml_compute_forward_dup(params, src0, dst);
-}
-
-// ggml_compute_forward_cont
-
-static void ggml_compute_forward_cont(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    ggml_compute_forward_dup(params, src0, dst);
-}
-
-// ggml_compute_forward_reshape
-
-static void ggml_compute_forward_reshape(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    // NOP
-    UNUSED(params);
-    UNUSED(src0);
-    UNUSED(dst);
-}
-
-// ggml_compute_forward_view
-
-static void ggml_compute_forward_view(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0) {
-    // NOP
-    UNUSED(params);
-    UNUSED(src0);
-}
-
-// ggml_compute_forward_permute
-
-static void ggml_compute_forward_permute(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0) {
-    // NOP
-    UNUSED(params);
-    UNUSED(src0);
-}
-
-// ggml_compute_forward_transpose
-
-static void ggml_compute_forward_transpose(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0) {
-    // NOP
-    UNUSED(params);
-    UNUSED(src0);
-}
-
-// ggml_compute_forward_get_rows
-
-static void ggml_compute_forward_get_rows_q(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
-    const enum ggml_type type = src0->type;
-    dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
-
-    assert( dst->ne[0] == nc);
-    assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == GGML_TYPE_SIZE[type]);
-
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
-
-        dequantize_row_q(
-                (const void *) ((char *) src0->data + r*src0->nb[1]),
-                     (float *) ((char *)  dst->data + i*dst->nb[1]), nc);
-    }
-}
-
-static void ggml_compute_forward_get_rows_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
-
-    assert( dst->ne[0] == nc);
-    assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == sizeof(ggml_fp16_t));
-
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
-
-        for (int j = 0; j < nc; ++j) {
-            ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
-            ((float *) ((char *)  dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);
-        }
-    }
-}
-
-static void ggml_compute_forward_get_rows_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nelements(src1);
-
-    assert( dst->ne[0] == nc);
-    assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
-
-        ggml_vec_cpy_f32(nc,
-                (float *) ((char *)  dst->data + i*dst->nb[1]),
-                (float *) ((char *) src0->data + r*src0->nb[1]));
-    }
-}
-
-static void ggml_compute_forward_get_rows(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-            {
-                ggml_compute_forward_get_rows_q(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_get_rows_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_get_rows_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-
-    //static bool first = true;
-    //printf("ne0 = %d, ne1 = %d, ne2 = %d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
-    //if (first) {
-    //    first = false;
-    //} else {
-    //    for (int k = 0; k < dst->ne[1]; ++k) {
-    //        for (int j = 0; j < dst->ne[0]/16; ++j) {
-    //            for (int i = 0; i < 16; ++i) {
-    //                printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
-    //            }
-    //            printf("\n");
-    //        }
-    //        printf("\n");
-    //    }
-    //    printf("\n");
-    //    exit(0);
-    //}
-}
-
-// ggml_compute_forward_diag_mask_inf
-
-static void ggml_compute_forward_diag_mask_inf_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 1);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n_past = ((int32_t *) src1->data)[0];
-
-    // TODO: handle transposed/permuted matrices
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-    const int nr = src0->ne[1];
-    const int nz = n/nr;
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int k = 0; k < nz; k++) {
-        for (int j = 0; j < nr; j++) {
-            for (int i = n_past; i < nc; i++) {
-                if (i > n_past + j) {
-                    *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = -INFINITY;
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_diag_mask_inf(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_diag_mask_inf_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_soft_max
-
-static void ggml_compute_forward_soft_max_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    GGML_ASSERT(ggml_is_contiguous(dst));
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // TODO: handle transposed/permuted matrices
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nc = src0->ne[0];
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float *p = (float *)((char *) dst->data + i1*dst->nb[1]);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(p[i]));
-        }
-#endif
-
-        float max = -INFINITY;
-        ggml_vec_max_f32(nc, &max, p);
-
-        ggml_float sum = 0.0;
-
-        uint16_t scvt;
-        for (int i = 0; i < nc; i++) {
-            //printf("p[%3d] = %8.4f\n", i, p[i]);
-            if (p[i] == -INFINITY) {
-                p[i] = 0.0f;
-            } else {
-                //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max);
-                ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);
-                memcpy(&scvt, &s, sizeof(scvt));
-                const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);
-                sum += (ggml_float)val;
-                p[i] = val;
-            }
-        }
-
-        assert(sum > 0.0);
-
-        sum = 1.0/sum;
-        ggml_vec_scale_f32(nc, p, sum);
-
-#ifndef NDEBUG
-        for (int i = 0; i < nc; ++i) {
-            assert(!isnan(p[i]));
-            assert(!isinf(p[i]));
-        }
-#endif
-    }
-}
-
-static void ggml_compute_forward_soft_max(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_soft_max_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_alibi
-
-static void ggml_compute_forward_alibi_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 2);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_head = ((int32_t *) src1->data)[1];
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    //const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int n  = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    assert(nb0 == sizeof(float));
-    assert(ne1 + n_past == ne0); (void) n_past;
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
-
-    for (int i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
-                float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                float *      pdst = (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // TODO: k*nb2 or k*nb3
-
-                float m_k;
-
-                if (k < n_heads_log2_floor) {
-                    m_k = powf(m0, k + 1);
-                } else {
-                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-                }
-
-                pdst[0] = (j+1) * m_k + src[0];
-            }
-        }
-    }
-}
-
-
-static void ggml_compute_forward_alibi_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 2);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_head = ((int32_t *) src1->data)[1];
-
-    const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
-    const int ne1 = src0->ne[1]; // seq_len_without_past
-    //const int ne2 = src0->ne[2]; // n_head -> this is k
-    //const int ne3 = src0->ne[3]; // 1 -> bsz
-
-    const int n  = ggml_nrows(src0);
-    const int ne2_ne3 = n/ne1; // ne2*ne3
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    //const int nb3 = src0->nb[3];
-
-    assert(nb0 == sizeof(ggml_fp16_t));
-    assert(ne1 + n_past == ne0); (void) n_past;
-
-    // add alibi to src0 (KQ_scaled)
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
-
-    for (int i = 0; i < ne0; i++) {
-        for (int j = 0; j < ne1; j++) {
-            for (int k = 0; k < ne2_ne3; k++) {
-                ggml_fp16_t * const src  = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
-                      float *      pdst  =       (float *)((char *)  dst->data + i*nb0 + j*nb1 + k*nb2);
-
-                // TODO: k*nb2 or k*nb3
-
-                float m_k;
-
-                if (k < n_heads_log2_floor) {
-                    m_k = powf(m0, k + 1);
-                } else {
-                    m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-                }
-
-                // we return F32
-                pdst[0] = (j+1) * m_k + GGML_FP16_TO_FP32(src[0]);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_alibi(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_alibi_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_alibi_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q4_2:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-        case GGML_TYPE_Q8_1:
-        case GGML_TYPE_I8:
-        case GGML_TYPE_I16:
-        case GGML_TYPE_I32:
-        case GGML_TYPE_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_rope
-
-static void ggml_compute_forward_rope_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 3);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
-
-    //const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-    const int64_t ne3 = src0->ne[3];
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    const int nb3 = src0->nb[3];
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
-            const int p = ((mode & 1) == 0 ? n_past + i2 : i2);
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta = (float)p;
-
-                for (int i0 = 0; i0 < n_dims; i0 += 2) {
-                    const float cos_theta = cosf(theta);
-                    const float sin_theta = sinf(theta);
-
-                    theta *= theta_scale;
-
-                    if (!is_neox) {
-                        const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                        const float x0 = src[0];
-                        const float x1 = src[1];
-
-                        dst_data[0] = x0*cos_theta - x1*sin_theta;
-                        dst_data[1] = x0*sin_theta + x1*cos_theta;
-                    } else {
-                        const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
-                              float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
-
-                        const float x0 = src[0];
-                        const float x1 = src[n_dims/2];
-
-                        dst_data[0]        = x0*cos_theta - x1*sin_theta;
-                        dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    assert(src1->type == GGML_TYPE_I32);
-    assert(ggml_nelements(src1) == 3);
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n_past = ((int32_t *) src1->data)[0];
-    const int n_dims = ((int32_t *) src1->data)[1];
-    const int mode   = ((int32_t *) src1->data)[2];
-
-    //const int64_t ne0 = src0->ne[0];
-    const int64_t ne1 = src0->ne[1];
-    const int64_t ne2 = src0->ne[2];
-    const int64_t ne3 = src0->ne[3];
-
-    const int nb0 = src0->nb[0];
-    const int nb1 = src0->nb[1];
-    const int nb2 = src0->nb[2];
-    const int nb3 = src0->nb[3];
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(ggml_fp16_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(src0);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = ((mode & 1) == 0 ? 0 : n_past); i2 < ne2; i2++) {
-            const int p = ((mode & 1) == 0 ? n_past + i2 : i2);
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta = (float)p;
-
-                for (int i0 = 0; i0 < n_dims; i0 += 2) {
-                    const float cos_theta = cosf(theta);
-                    const float sin_theta = sinf(theta);
-
-                    theta *= theta_scale;
-
-                    if (!is_neox) {
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
-
-                        const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[1]);
-
-                        dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                    } else {
-                        const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
-                              ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (i0/2)*nb0);
-
-                        const float x0 = GGML_FP16_TO_FP32(src[0]);
-                        const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]);
-
-                        dst_data[0]        = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
-                        dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_conv_1d_1s
-
-static void ggml_compute_forward_conv_1d_1s_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    //const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    //const int64_t ne12 = src1->ne[2];
-    //const int64_t ne13 = src1->ne[3];
-
-    //const int64_t ne0  = dst->ne[0];
-    //const int64_t ne1  = dst->ne[1];
-    //const int64_t ne2  = dst->ne[2];
-    //const int64_t ne3  = dst->ne[3];
-    //const int64_t ne   = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    //const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    //const int nb12 = src1->nb[2];
-    //const int nb13 = src1->nb[3];
-
-    //const int nb0  = dst->nb[0];
-    const int nb1  = dst->nb[1];
-    //const int nb2  = dst->nb[2];
-    //const int nb3  = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00;
-    const int nh = nk/2;
-
-    const int ew0 = ggml_up32(ne01);
-
-    GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        // TODO: fix this memset (wsize is overestimated)
-        memset(params->wdata, 0, params->wsize);
-
-        // prepare kernel data (src0)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ew0 + i01] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // prepare source data (src1)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
-
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                ggml_fp16_t * dst_data = wdata;
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // total rows in dst
-    const int nr = ne02;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        for (int64_t i0 = 0; i0 < ne10; ++i0) {
-            dst_data[i0] = 0;
-            for (int k = -nh; k <= nh; k++) {
-                float v = 0.0f;
-                ggml_vec_dot_f16(ew0, &v,
-                        (ggml_fp16_t *) params->wdata +   i1*ew0*ne00 +      (nh + k)*ew0,
-                        (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);
-
-                dst_data[i0] += v;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_1d_1s_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    //const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    //const int64_t ne12 = src1->ne[2];
-    //const int64_t ne13 = src1->ne[3];
-
-    //const int64_t ne0  = dst->ne[0];
-    //const int64_t ne1  = dst->ne[1];
-    //const int64_t ne2  = dst->ne[2];
-    //const int64_t ne3  = dst->ne[3];
-    //const int64_t ne   = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    //const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    //const int nb12 = src1->nb[2];
-    //const int nb13 = src1->nb[3];
-
-    //const int nb0  = dst->nb[0];
-    const int nb1  = dst->nb[1];
-    //const int nb2  = dst->nb[2];
-    //const int nb3  = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00;
-    const int nh = nk/2;
-
-    const int ew0 = ggml_up32(ne01);
-
-    GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        // TODO: fix this memset (wsize is overestimated)
-        memset(params->wdata, 0, params->wsize);
-
-        // prepare kernel data (src0)
-        {
-            float * const wdata = (float *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    float * dst_data = wdata + i02*ew0*ne00;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ew0 + i01] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // prepare source data (src1)
-        {
-            float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
-
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                float * dst_data = wdata;
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[(i10 + nh)*ew0 + i11] = src[i10];
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // total rows in dst
-    const int nr = ne02;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        for (int64_t i0 = 0; i0 < ne10; ++i0) {
-            dst_data[i0] = 0;
-            for (int k = -nh; k <= nh; k++) {
-                float v = 0.0f;
-                ggml_vec_dot_f32(ew0, &v,
-                        (float *) params->wdata +   i1*ew0*ne00 +      (nh + k)*ew0,
-                        (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);
-
-                dst_data[i0] += v;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_1d_1s(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_conv_1d_2s
-
-static void ggml_compute_forward_conv_1d_2s_f16_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    //const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    //const int64_t ne12 = src1->ne[2];
-    //const int64_t ne13 = src1->ne[3];
-
-    //const int64_t ne0  = dst->ne[0];
-    //const int64_t ne1  = dst->ne[1];
-    //const int64_t ne2  = dst->ne[2];
-    //const int64_t ne3  = dst->ne[3];
-    //const int64_t ne   = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    //const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    //const int nb12 = src1->nb[2];
-    //const int nb13 = src1->nb[3];
-
-    //const int nb0  = dst->nb[0];
-    const int nb1  = dst->nb[1];
-    //const int nb2  = dst->nb[2];
-    //const int nb3  = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00;
-    const int nh = nk/2;
-
-    const int ew0 = ggml_up32(ne01);
-
-    GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes
-    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        // TODO: fix this memset (wsize is overestimated)
-        memset(params->wdata, 0, params->wsize);
-
-        // prepare kernel data (src0)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ew0 + i01] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // prepare source data (src1)
-        {
-            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;
-
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                ggml_fp16_t * dst_data = wdata;
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // total rows in dst
-    const int nr = ne02;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
-            dst_data[i0/2] = 0;
-            for (int k = -nh; k <= nh; k++) {
-                float v = 0.0f;
-                ggml_vec_dot_f16(ew0, &v,
-                        (ggml_fp16_t *) params->wdata +   i1*ew0*ne00 +      (nh + k)*ew0,
-                        (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);
-
-                dst_data[i0/2] += v;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_1d_2s_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-              struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    //const int64_t ne03 = src0->ne[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    //const int64_t ne12 = src1->ne[2];
-    //const int64_t ne13 = src1->ne[3];
-
-    //const int64_t ne0  = dst->ne[0];
-    //const int64_t ne1  = dst->ne[1];
-    //const int64_t ne2  = dst->ne[2];
-    //const int64_t ne3  = dst->ne[3];
-    //const int64_t ne   = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
-    const int nb01 = src0->nb[1];
-    const int nb02 = src0->nb[2];
-    //const int nb03 = src0->nb[3];
-
-    const int nb10 = src1->nb[0];
-    const int nb11 = src1->nb[1];
-    //const int nb12 = src1->nb[2];
-    //const int nb13 = src1->nb[3];
-
-    //const int nb0  = dst->nb[0];
-    const int nb1  = dst->nb[1];
-    //const int nb2  = dst->nb[2];
-    //const int nb3  = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nk = ne00;
-    const int nh = nk/2;
-
-    const int ew0 = ggml_up32(ne01);
-
-    GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes
-    GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(nb10 == sizeof(float));
-
-    if (params->type == GGML_TASK_INIT) {
-        // TODO: fix this memset (wsize is overestimated)
-        memset(params->wdata, 0, params->wsize);
-
-        // prepare kernel data (src0)
-        {
-            float * const wdata = (float *) params->wdata + 0;
-
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    float * dst_data = wdata + i02*ew0*ne00;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ew0 + i01] = src[i00];
-                    }
-                }
-            }
-        }
-
-        // prepare source data (src1)
-        {
-            float * const wdata = (float *) params->wdata + ne02*ew0*ne00;
-
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                float * dst_data = wdata;
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[(i10 + nh)*ew0 + i11] = src[i10];
-                }
-            }
-        }
-
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // total rows in dst
-    const int nr = ne02;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int i1 = ir0; i1 < ir1; i1++) {
-        float * dst_data = (float *)((char *) dst->data + i1*nb1);
-        for (int64_t i0 = 0; i0 < ne10; i0 += 2) {
-            dst_data[i0/2] = 0;
-            for (int k = -nh; k <= nh; k++) {
-                float v = 0.0f;
-                ggml_vec_dot_f32(ew0, &v,
-                        (float *) params->wdata +   i1*ew0*ne00 +      (nh + k)*ew0,
-                        (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);
-
-                dst_data[i0/2] += v;
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_conv_1d_2s(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_flash_attn
-
-static void ggml_compute_forward_flash_attn_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * q,
-        const struct ggml_tensor * k,
-        const struct ggml_tensor * v,
-        const bool masked,
-             struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t neq0 = q->ne[0];
-    const int64_t neq1 = q->ne[1];
-    const int64_t neq2 = q->ne[2];
-    const int64_t neq3 = q->ne[3];
-
-    const int64_t nek0 = k->ne[0];
-    const int64_t nek1 = k->ne[1];
-    //const int64_t nek2 = k->ne[2];
-    //const int64_t nek3 = k->ne[3];
-
-    //const int64_t nev0 = v->ne[0];
-    const int64_t nev1 = v->ne[1];
-    //const int64_t nev2 = v->ne[2];
-    //const int64_t nev3 = v->ne[3];
-
-    const int64_t ne0  = dst->ne[0];
-    const int64_t ne1  = dst->ne[1];
-    //const int64_t ne2  = dst->ne[2];
-    //const int64_t ne3  = dst->ne[3];
-
-    const int nbk0 = k->nb[0];
-    const int nbk1 = k->nb[1];
-    const int nbk2 = k->nb[2];
-    const int nbk3 = k->nb[3];
-
-    const int nbq0 = q->nb[0];
-    const int nbq1 = q->nb[1];
-    const int nbq2 = q->nb[2];
-    const int nbq3 = q->nb[3];
-
-    const int nbv0 = v->nb[0];
-    const int nbv1 = v->nb[1];
-    const int nbv2 = v->nb[2];
-    const int nbv3 = v->nb[3];
-
-    const int nb0  = dst->nb[0];
-    const int nb1  = dst->nb[1];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(float));
-    GGML_ASSERT(nbk0 == sizeof(float));
-    GGML_ASSERT(nbv0 == sizeof(float));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);
-
-        for (int i = M; i < Mup; ++i) {
-            S[i] = -INFINITY;
-        }
-
-        for (int64_t ic = 0; ic < nek1; ++ic) {
-            // k indices
-            const int ik3 = iq3;
-            const int ik2 = iq2;
-            const int ik1 = ic;
-
-            // S indices
-            const int i1 = ik1;
-
-            ggml_vec_dot_f32(neq0,
-                    S + i1,
-                    (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                    (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
-        }
-
-        // scale
-        ggml_vec_scale_f32(nek1, S, scale);
-
-        if (masked) {
-            for (int64_t i = P; i < M; i++) {
-                if (i > P + iq1) {
-                    S[i] = -INFINITY;
-                }
-            }
-        }
-
-        // softmax
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(M, &max, S);
-
-            ggml_float sum = 0.0;
-            {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                max = -max;
-                vDSP_vsadd(S, 1, &max, S, 1, Mup);
-                vvexpf(S, S, &Mup);
-                ggml_vec_sum_f32(Mup, &sum, S);
-#else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
-#endif
-            }
-
-            assert(sum > 0.0);
-
-            sum = 1.0/sum;
-            ggml_vec_scale_f32(M, S, sum);
-
-#ifndef NDEBUG
-            for (int i = 0; i < M; ++i) {
-                assert(!isnan(S[i]));
-                assert(!isinf(S[i]));
-            }
-#endif
-        }
-
-        for (int64_t ic = 0; ic < nev1; ++ic) {
-            // dst indices
-            const int i1 = iq1;
-            const int i2 = iq2;
-            const int i3 = iq3;
-
-            ggml_vec_dot_f32(nek1,
-                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2  + i3*nb3)),
-                    (float *) ((char *) v->data   + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),
-                    S);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * q,
-        const struct ggml_tensor * k,
-        const struct ggml_tensor * v,
-        const bool masked,
-             struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t neq0 = q->ne[0];
-    const int64_t neq1 = q->ne[1];
-    const int64_t neq2 = q->ne[2];
-    const int64_t neq3 = q->ne[3];
-
-    const int64_t nek0 = k->ne[0];
-    const int64_t nek1 = k->ne[1];
-    //const int64_t nek2 = k->ne[2];
-    //const int64_t nek3 = k->ne[3];
-
-    //const int64_t nev0 = v->ne[0];
-    const int64_t nev1 = v->ne[1];
-    //const int64_t nev2 = v->ne[2];
-    //const int64_t nev3 = v->ne[3];
-
-    const int64_t ne0  = dst->ne[0];
-    const int64_t ne1  = dst->ne[1];
-    //const int64_t ne2  = dst->ne[2];
-    //const int64_t ne3  = dst->ne[3];
-
-    const int nbk0 = k->nb[0];
-    const int nbk1 = k->nb[1];
-    const int nbk2 = k->nb[2];
-    const int nbk3 = k->nb[3];
-
-    const int nbq0 = q->nb[0];
-    const int nbq1 = q->nb[1];
-    const int nbq2 = q->nb[2];
-    const int nbq3 = q->nb[3];
-
-    const int nbv0 = v->nb[0];
-    const int nbv1 = v->nb[1];
-    const int nbv2 = v->nb[2];
-    const int nbv3 = v->nb[3];
-
-    const int nb0  = dst->nb[0];
-    const int nb1  = dst->nb[1];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = neq0;
-    const int64_t N = neq1;
-    const int64_t P = nek1 - N;
-    const int64_t M = P + N;
-
-    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);
-
-    GGML_ASSERT(ne0 == D);
-    GGML_ASSERT(ne1 == N);
-    GGML_ASSERT(P >= 0);
-
-    GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
-
-    GGML_ASSERT(neq0 == D);
-    GGML_ASSERT(nek0 == D);
-    GGML_ASSERT(nev1 == D);
-
-    GGML_ASSERT(neq1 == N);
-    GGML_ASSERT(nek1 == N + P);
-    GGML_ASSERT(nev1 == D);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by q rows using ggml_vec_dot_f32
-
-    // total rows in q
-    const int nr = neq1*neq2*neq3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    const float scale = 1.0f/sqrtf(D);
-
-    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // q indices
-        const int iq3 = ir/(neq2*neq1);
-        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
-        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
-
-        float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);
-
-        for (int i = M; i < Mup; ++i) {
-            S[i] = -INFINITY;
-        }
-
-        if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
-            for (int64_t ic = 0; ic < nek1; ++ic) {
-                // k indices
-                const int ik3 = iq3;
-                const int ik2 = iq2;
-                const int ik1 = ic;
-
-                // S indices
-                const int i1 = ik1;
-
-                ggml_vec_dot_f16(neq0,
-                        S + i1,
-                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
-            }
-        } else {
-            for (int64_t ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
-                // k indices
-                const int ik3 = iq3;
-                const int ik2 = iq2;
-                const int ik1 = ic;
-
-                // S indices
-                const int i1 = ik1;
-
-                ggml_vec_dot_f16_unroll(neq0, nbk1,
-                        S + i1,
-                        ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
-                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
-            }
-        }
-
-        // scale
-        ggml_vec_scale_f32(nek1, S, scale);
-
-        if (masked) {
-            for (int64_t i = P; i < M; i++) {
-                if (i > P + iq1) {
-                    S[i] = -INFINITY;
-                }
-            }
-        }
-
-        // softmax
-        {
-            float max = -INFINITY;
-            ggml_vec_max_f32(M, &max, S);
-
-            ggml_float sum = 0.0;
-            {
-#ifdef GGML_SOFT_MAX_ACCELERATE
-                max = -max;
-                vDSP_vsadd(S, 1, &max, S, 1, Mup);
-                vvexpf(S, S, &Mup);
-                ggml_vec_sum_f32(Mup, &sum, S);
-#else
-                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];
-                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };
-
-                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {
-                    float * SS = S + i;
-
-                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {
-                        if (SS[j] == -INFINITY) {
-                            SS[j] = 0.0f;
-                        } else {
-                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);
-                            memcpy(&scvt[j], &s, sizeof(uint16_t));
-                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);
-                            sump[j] += (ggml_float)val;
-                            SS[j] = val;
-                        }
-                    }
-                }
-
-                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {
-                    sum += sump[i];
-                }
-#endif
-            }
-
-            assert(sum > 0.0);
-
-            sum = 1.0/sum;
-            ggml_vec_scale_f32(M, S, sum);
-
-#ifndef NDEBUG
-            for (int i = 0; i < M; ++i) {
-                assert(!isnan(S[i]));
-                assert(!isinf(S[i]));
-            }
-#endif
-        }
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);
-
-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
-        }
-
-        if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
-            for (int64_t ic = 0; ic < nev1; ++ic) {
-                // dst indices
-                const int i1 = iq1;
-                const int i2 = iq2;
-                const int i3 = iq3;
-
-                ggml_vec_dot_f16(nek1,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2  + i3*nb3)),
-                        (ggml_fp16_t *) ((char *) v->data   + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),
-                        S16);
-            }
-        } else {
-            for (int64_t ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
-                // dst indices
-                const int i1 = iq1;
-                const int i2 = iq2;
-                const int i3 = iq3;
-
-                ggml_vec_dot_f16_unroll(nek1, nbv1,
-                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2  + i3*nb3)),
-                        ((char *) v->data   + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),
-                        S16);
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_attn(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * q,
-        const struct ggml_tensor * k,
-        const struct ggml_tensor * v,
-        const bool masked,
-        struct ggml_tensor * dst) {
-    switch (q->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_flash_ff
-
-static void ggml_compute_forward_flash_ff_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,  // F16
-        const struct ggml_tensor * b0, // F16 fc_w
-        const struct ggml_tensor * b1, // F32 fc_b
-        const struct ggml_tensor * c0, // F16 proj_w
-        const struct ggml_tensor * c1, // F32 proj_b
-        struct ggml_tensor * dst) {
-    int64_t t0 = ggml_perf_time_us();
-    UNUSED(t0);
-
-    const int64_t nea0 = a->ne[0];
-    const int64_t nea1 = a->ne[1];
-    const int64_t nea2 = a->ne[2];
-    const int64_t nea3 = a->ne[3];
-
-    const int64_t neb00 = b0->ne[0];
-    const int64_t neb01 = b0->ne[1];
-    //const int64_t neb02 = b0->ne[2];
-    //const int64_t neb03 = b0->ne[3];
-
-    const int64_t neb10 = b1->ne[0];
-    const int64_t neb11 = b1->ne[1];
-    //const int64_t neb12 = b1->ne[2];
-    //const int64_t neb13 = b1->ne[3];
-
-    const int64_t nec00 = c0->ne[0];
-    const int64_t nec01 = c0->ne[1];
-    //const int64_t nec02 = c0->ne[2];
-    //const int64_t nec03 = c0->ne[3];
-
-    const int64_t nec10 = c1->ne[0];
-    const int64_t nec11 = c1->ne[1];
-    //const int64_t nec12 = c1->ne[2];
-    //const int64_t nec13 = c1->ne[3];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    //const int64_t ne3 = dst->ne[3];
-
-    const int nba0 = a->nb[0];
-    const int nba1 = a->nb[1];
-    const int nba2 = a->nb[2];
-    const int nba3 = a->nb[3];
-
-    const int nbb00 = b0->nb[0];
-    const int nbb01 = b0->nb[1];
-    const int nbb02 = b0->nb[2];
-    const int nbb03 = b0->nb[3];
-
-    const int nbb10 = b1->nb[0];
-    //const int nbb11 = b1->nb[1];
-    //const int nbb12 = b1->nb[2];
-    //const int nbb13 = b1->nb[3];
-
-    const int nbc00 = c0->nb[0];
-    const int nbc01 = c0->nb[1];
-    const int nbc02 = c0->nb[2];
-    const int nbc03 = c0->nb[3];
-
-    const int nbc10 = c1->nb[0];
-    //const int nbc11 = c1->nb[1];
-    //const int nbc12 = c1->nb[2];
-    //const int nbc13 = c1->nb[3];
-
-    const int nb0 = dst->nb[0];
-    const int nb1 = dst->nb[1];
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int64_t D = nea0;
-    //const int64_t N = nea1;
-    const int64_t M = neb01;
-
-    GGML_ASSERT(ne0 == nea0);
-    GGML_ASSERT(ne1 == nea1);
-    GGML_ASSERT(ne2 == nea2);
-
-    GGML_ASSERT(nba0  == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbb10 == sizeof(float));
-    GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t));
-    GGML_ASSERT(nbc10 == sizeof(float));
-
-    GGML_ASSERT(neb00 == D);
-    GGML_ASSERT(neb01 == M);
-    GGML_ASSERT(neb10 == M);
-    GGML_ASSERT(neb11 == 1);
-
-    GGML_ASSERT(nec00 == M);
-    GGML_ASSERT(nec01 == D);
-    GGML_ASSERT(nec10 == D);
-    GGML_ASSERT(nec11 == 1);
-
-    // dst cannot be transposed or permuted
-    GGML_ASSERT(nb0 == sizeof(float));
-    GGML_ASSERT(nb0 <= nb1);
-    GGML_ASSERT(nb1 <= nb2);
-    GGML_ASSERT(nb2 <= nb3);
-
-    if (params->type == GGML_TASK_INIT) {
-        return;
-    }
-
-    if (params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // parallelize by a rows using ggml_vec_dot_f32
-
-    // total rows in a
-    const int nr = nea1*nea2*nea3;
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    for (int ir = ir0; ir < ir1; ++ir) {
-        // a indices
-        const int ia3 = ir/(nea2*nea1);
-        const int ia2 = (ir - ia3*nea2*nea1)/nea1;
-        const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1);
-
-        float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);
-
-        for (int64_t ic = 0; ic < neb01; ++ic) {
-            // b0 indices
-            const int ib03 = ia3;
-            const int ib02 = ia2;
-            const int ib01 = ic;
-
-            // S indices
-            const int i1 = ib01;
-
-            ggml_vec_dot_f16(nea0,
-                    S + i1,
-                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),
-                    (ggml_fp16_t *) ((char *)  a->data + ( ia1*nba1  +  ia2*nba2  +  ia3*nba3)));
-        }
-
-        ggml_vec_add_f32(neb01, S, S, (float *) b1->data);
-        //ggml_vec_gelu_f32(neb01, S, S);
-
-        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);
-
-        for (int64_t i = 0; i < M; i++) {
-            S16[i] = GGML_FP32_TO_FP16(S[i]);
-        }
-
-        ggml_vec_gelu_f16(neb01, S16, S16);
-
-        {
-            // dst indices
-            const int i1 = ia1;
-            const int i2 = ia2;
-            const int i3 = ia3;
-
-            for (int64_t ic = 0; ic < nec01; ++ic) {
-
-                ggml_vec_dot_f16(neb01,
-                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1   + i2*nb2   + i3*nb3)),
-                        (ggml_fp16_t *) ((char *) c0->data  + (         ic*nbc01 + i2*nbc02 + i3*nbc03)),
-                        S16);
-            }
-
-            ggml_vec_add_f32(nec01,
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),
-                    (float *) c1->data);
-        }
-    }
-}
-
-static void ggml_compute_forward_flash_ff(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * a,
-        const struct ggml_tensor * b0,
-        const struct ggml_tensor * b1,
-        const struct ggml_tensor * c0,
-        const struct ggml_tensor * c1,
-        struct ggml_tensor * dst) {
-    switch (b0->type) {
-        case GGML_TYPE_F16:
-            {
-                ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst);
-            } break;
-        case GGML_TYPE_F32:
-            {
-                GGML_ASSERT(false); // TODO
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_map_unary
-
-static void ggml_compute_forward_map_unary_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
-    GGML_ASSERT(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-
-static void ggml_compute_forward_map_unary(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst,
-        const ggml_unary_op_f32_t fun) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_unary_f32(params, src0, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-// ggml_compute_forward_map_binary
-
-static void ggml_compute_forward_map_binary_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst,
-        const ggml_binary_op_f32_t fun) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert( dst->nb[0] == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-    assert(src1->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        fun(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])),
-                (float *) ((char *) src1->data + i*(src1->nb[1])));
-    }
-}
-
-
-static void ggml_compute_forward_map_binary(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst,
-        const ggml_binary_op_f32_t fun) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_map_binary_f32(params, src0, src1, dst, fun);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-/////////////////////////////////
-
-static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    GGML_ASSERT(params);
-
-    switch (tensor->op) {
-        case GGML_OP_DUP:
-            {
-                ggml_compute_forward_dup(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_ADD:
-            {
-                ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_SUB:
-            {
-                ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_MUL:
-            {
-                ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_DIV:
-            {
-                ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_SQR:
-            {
-                ggml_compute_forward_sqr(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_SQRT:
-            {
-                ggml_compute_forward_sqrt(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_SUM:
-            {
-                ggml_compute_forward_sum(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_MEAN:
-            {
-                ggml_compute_forward_mean(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_REPEAT:
-            {
-                ggml_compute_forward_repeat(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_ABS:
-            {
-                ggml_compute_forward_abs(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_SGN:
-            {
-                ggml_compute_forward_sgn(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_NEG:
-            {
-                ggml_compute_forward_neg(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_STEP:
-            {
-                ggml_compute_forward_step(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_RELU:
-            {
-                ggml_compute_forward_relu(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_GELU:
-            {
-                ggml_compute_forward_gelu(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_SILU:
-            {
-                ggml_compute_forward_silu(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_NORM:
-            {
-                ggml_compute_forward_norm(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_RMS_NORM:
-            {
-                ggml_compute_forward_rms_norm(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_MUL_MAT:
-            {
-                ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_SCALE:
-            {
-                ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_CPY:
-            {
-                ggml_compute_forward_cpy(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_CONT:
-            {
-                ggml_compute_forward_cont(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_RESHAPE:
-            {
-                ggml_compute_forward_reshape(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_VIEW:
-            {
-                ggml_compute_forward_view(params, tensor->src0);
-            } break;
-        case GGML_OP_PERMUTE:
-            {
-                ggml_compute_forward_permute(params, tensor->src0);
-            } break;
-        case GGML_OP_TRANSPOSE:
-            {
-                ggml_compute_forward_transpose(params, tensor->src0);
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_DIAG_MASK_INF:
-            {
-                ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                ggml_compute_forward_soft_max(params, tensor->src0, tensor);
-            } break;
-        case GGML_OP_ROPE:
-            {
-                ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_ALIBI:
-            {
-                ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_CONV_1D_1S:
-            {
-                ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_CONV_1D_2S:
-            {
-                ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor);
-            } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);
-                GGML_ASSERT(t == 0 || t == 1);
-                bool masked = t != 0;
-                ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);
-            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);
-            } break;
-        case GGML_OP_MAP_UNARY:
-            {
-                const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data);
-                ggml_compute_forward_map_unary(params, tensor->src0, tensor, fun);
-            }
-            break;
-        case GGML_OP_MAP_BINARY:
-            {
-                const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->opt[0]->data);
-                ggml_compute_forward_map_binary(params, tensor->src0, tensor->src1, tensor, fun);
-            }
-            break;
-        case GGML_OP_NONE:
-            {
-                // nop
-            } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {
-    struct ggml_tensor * src0 = tensor->src0;
-    struct ggml_tensor * src1 = tensor->src1;
-
-    switch (tensor->op) {
-        case GGML_OP_DUP:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
-                }
-            } break;
-        case GGML_OP_ADD:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
-                }
-                if (src1->grad) {
-                    src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace);
-                }
-            } break;
-        case GGML_OP_SUB:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);
-                }
-                if (src1->grad) {
-                    src1->grad = ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace);
-                }
-            } break;
-        case GGML_OP_MUL:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_impl(ctx,
-                                src0->grad,
-                                ggml_mul(ctx, src1, tensor->grad),
-                                inplace);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_impl(ctx,
-                                src1->grad,
-                                ggml_mul(ctx, src0, tensor->grad),
-                                inplace);
-                }
-            } break;
-        case GGML_OP_DIV:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_impl(ctx,
-                                src0->grad,
-                                ggml_div(ctx, tensor->grad, src1),
-                                inplace);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_sub_impl(ctx,
-                                src1->grad,
-                                ggml_mul(ctx,
-                                    tensor->grad,
-                                    ggml_div(ctx, tensor, src1)),
-                                inplace);
-                }
-            } break;
-        case GGML_OP_SQR:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_impl(ctx,
-                                src0->grad,
-                                ggml_mul(ctx,
-                                    ggml_mul(ctx, src0, tensor->grad),
-                                    ggml_repeat(ctx, ggml_new_f32(ctx, 2.0f), src0)),
-                                inplace);
-                }
-            } break;
-        case GGML_OP_SQRT:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_impl(ctx,
-                                src0->grad,
-                                ggml_div(ctx,
-                                    ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor),
-                                    tensor),
-                                inplace);
-                }
-            } break;
-        case GGML_OP_SUM:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_impl(ctx,
-                                src0->grad,
-                                ggml_repeat(ctx, tensor->grad, src0->grad),
-                                inplace);
-                }
-            } break;
-        case GGML_OP_MEAN:
-            {
-                GGML_ASSERT(false); // TODO: implement
-            } break;
-        case GGML_OP_REPEAT:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_impl(ctx,
-                                src0->grad,
-                                ggml_sum(ctx, tensor->grad),
-                                inplace);
-                }
-            } break;
-        case GGML_OP_ABS:
-            {
-                if (src0->grad) {
-                    src0->grad =
-                        ggml_add_impl(ctx,
-                                src0->grad,
-                                ggml_mul(ctx,
-                                    ggml_sgn(ctx, src0),
-                                    tensor->grad),
-                                inplace);
-                }
-            } break;
-        case GGML_OP_SGN:
-            {
-                if (src0->grad) {
-                    // noop
-                }
-            } break;
-        case GGML_OP_NEG:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);
-                }
-            } break;
-        case GGML_OP_STEP:
-            {
-                if (src0->grad) {
-                    // noop
-                }
-            } break;
-        case GGML_OP_RELU:
-            {
-                if (src0->grad) {
-                    src0->grad = ggml_sub_impl(ctx,
-                            src0->grad,
-                            ggml_mul(ctx,
-                                ggml_step(ctx, src0),
-                                tensor->grad),
-                            inplace);
-                }
-            } break;
-        case GGML_OP_GELU:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_ALIBI:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_SILU:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_NORM:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_RMS_NORM:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_MUL_MAT:
-            {
-                if (src0->grad) {
-                    // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad);
-                    GGML_ASSERT(false);
-                }
-                if (src1->grad) {
-                    src1->grad =
-                        ggml_add_impl(ctx,
-                                src1->grad,
-                                ggml_mul_mat(ctx,
-                                    ggml_cont(ctx, ggml_transpose(ctx, src0)),
-                                    tensor->grad),
-                                inplace);
-                }
-            } break;
-        case GGML_OP_SCALE:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CPY:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONT:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_RESHAPE:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_VIEW:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_PERMUTE:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_TRANSPOSE:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_DIAG_MASK_INF:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_SOFT_MAX:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_ROPE:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_1D_1S:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_CONV_1D_2S:
-            {
-                GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case GGML_OP_FLASH_ATTN:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_FLASH_FF:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_MAP_UNARY:
-        case GGML_OP_MAP_BINARY:
-            {
-                GGML_ASSERT(false); // not supported
-            } break;
-        case GGML_OP_NONE:
-            {
-                // nop
-            } break;
-        case GGML_OP_COUNT:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
-    if (node->grad == NULL) {
-        // this usually happens when we generate intermediate nodes from constants in the backward pass
-        // it can also happen during forward pass, if the user performs computations with constants
-        if (node->op != GGML_OP_NONE) {
-            //GGML_PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node->op);
-        }
-    }
-
-    // check if already visited
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (cgraph->nodes[i] == node) {
-            return;
-        }
-    }
-
-    for (int i = 0; i < cgraph->n_leafs; i++) {
-        if (cgraph->leafs[i] == node) {
-            return;
-        }
-    }
-
-    if (node->src0) {
-        ggml_visit_parents(cgraph, node->src0);
-    }
-
-    if (node->src1) {
-        ggml_visit_parents(cgraph, node->src1);
-    }
-
-    for (int i = 0; i < GGML_MAX_OPT; ++i) {
-        if (node->opt[i]) {
-            ggml_visit_parents(cgraph, node->opt[i]);
-        }
-    }
-
-    if (node->op == GGML_OP_NONE && node->grad == NULL) {
-        // reached a leaf node, not part of the gradient graph (e.g. a constant)
-        GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);
-
-        cgraph->leafs[cgraph->n_leafs] = node;
-        cgraph->n_leafs++;
-    } else {
-        GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);
-
-        cgraph->nodes[cgraph->n_nodes] = node;
-        cgraph->grads[cgraph->n_nodes] = node->grad;
-        cgraph->n_nodes++;
-    }
-}
-
-static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
-    if (!expand) {
-        cgraph->n_nodes = 0;
-        cgraph->n_leafs = 0;
-    }
-
-    const int n0 = cgraph->n_nodes;
-    UNUSED(n0);
-
-    ggml_visit_parents(cgraph, tensor);
-
-    const int n_new = cgraph->n_nodes - n0;
-    GGML_PRINT_DEBUG("%s: visited %d new nodes\n", __func__, n_new);
-
-    if (n_new > 0) {
-        // the last added node should always be starting point
-        GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor);
-    }
-}
-
-void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
-    ggml_build_forward_impl(cgraph, tensor, true);
-}
-
-struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
-    struct ggml_cgraph result = {
-        /*.n_nodes      =*/ 0,
-        /*.n_leafs      =*/ 0,
-        /*.n_threads    =*/ GGML_DEFAULT_N_THREADS,
-        /*.work_size    =*/ 0,
-        /*.work         =*/ NULL,
-        /*.nodes        =*/ { NULL },
-        /*.grads        =*/ { NULL },
-        /*.leafs        =*/ { NULL },
-        /*.perf_runs    =*/ 0,
-        /*.perf_cycles  =*/ 0,
-        /*.perf_time_us =*/ 0,
-    };
-
-    ggml_build_forward_impl(&result, tensor, false);
-
-    return result;
-}
-
-struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {
-    struct ggml_cgraph result = *gf;
-
-    GGML_ASSERT(gf->n_nodes > 0);
-
-    // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph
-    if (keep) {
-        for (int i = 0; i < gf->n_nodes; i++) {
-            struct ggml_tensor * node = gf->nodes[i];
-
-            if (node->grad) {
-                node->grad = ggml_dup_tensor(ctx, node);
-                gf->grads[i] = node->grad;
-            }
-        }
-    }
-
-    for (int i = gf->n_nodes - 1; i >= 0; i--) {
-        struct ggml_tensor * node = gf->nodes[i];
-
-        // because we detached the grad nodes from the original graph, we can afford inplace operations
-        if (node->grad) {
-            ggml_compute_backward(ctx, node, keep);
-        }
-    }
-
-    for (int i = gf->n_nodes - 1; i >= 0; i--) {
-        struct ggml_tensor * node = gf->nodes[i];
-
-        if (node->is_param) {
-            GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
-            ggml_build_forward_impl(&result, node->grad, true);
-        }
-    }
-
-    return result;
-}
-
-//
-// thread data
-//
-// synchronization is done via busy loops
-// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops
-//
-
-#ifdef __APPLE__
-
-//#include <os/lock.h>
-//
-//typedef os_unfair_lock ggml_lock_t;
-//
-//#define ggml_lock_init(x)    UNUSED(x)
-//#define ggml_lock_destroy(x) UNUSED(x)
-//#define ggml_lock_lock       os_unfair_lock_lock
-//#define ggml_lock_unlock     os_unfair_lock_unlock
-//
-//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x)    UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#define ggml_lock_lock(x)    UNUSED(x)
-#define ggml_lock_unlock(x)  UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-typedef pthread_t ggml_thread_t;
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join   pthread_join
-
-#else
-
-//typedef pthread_spinlock_t ggml_lock_t;
-
-//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)
-//#define ggml_lock_destroy pthread_spin_destroy
-//#define ggml_lock_lock    pthread_spin_lock
-//#define ggml_lock_unlock  pthread_spin_unlock
-
-typedef int ggml_lock_t;
-
-#define ggml_lock_init(x)    UNUSED(x)
-#define ggml_lock_destroy(x) UNUSED(x)
-#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
-#define ggml_lock_lock(x)    _mm_pause()
-#else
-#define ggml_lock_lock(x)    UNUSED(x)
-#endif
-#define ggml_lock_unlock(x)  UNUSED(x)
-
-#define GGML_LOCK_INITIALIZER 0
-
-typedef pthread_t ggml_thread_t;
-
-#define ggml_thread_create pthread_create
-#define ggml_thread_join   pthread_join
-
-#endif
-
-struct ggml_compute_state_shared {
-    ggml_lock_t spin;
-
-    int n_threads;
-
-    // synchronization primitives
-    atomic_int  n_ready;
-    atomic_bool has_work;
-    atomic_bool stop; // stop all threads
-};
-
-struct ggml_compute_state {
-    ggml_thread_t thrd;
-
-    struct ggml_compute_params params;
-    struct ggml_tensor * node;
-
-    struct ggml_compute_state_shared * shared;
-};
-
-static thread_ret_t ggml_graph_compute_thread(void * data) {
-    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
-
-    const int n_threads = state->shared->n_threads;
-
-    while (true) {
-        if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
-            atomic_store(&state->shared->has_work, false);
-        } else {
-            while (atomic_load(&state->shared->has_work)) {
-                if (atomic_load(&state->shared->stop)) {
-                    return 0;
-                }
-                ggml_lock_lock  (&state->shared->spin);
-                ggml_lock_unlock(&state->shared->spin);
-            }
-        }
-
-        atomic_fetch_sub(&state->shared->n_ready, 1);
-
-        // wait for work
-        while (!atomic_load(&state->shared->has_work)) {
-            if (atomic_load(&state->shared->stop)) {
-                return 0;
-            }
-            ggml_lock_lock  (&state->shared->spin);
-            ggml_lock_unlock(&state->shared->spin);
-        }
-
-        // check if we should stop
-        if (atomic_load(&state->shared->stop)) {
-            break;
-        }
-
-        if (state->node) {
-            if (state->params.ith < state->params.nth) {
-                ggml_compute_forward(&state->params, state->node);
-            }
-
-            state->node = NULL;
-        } else {
-            break;
-        }
-    }
-
-    return 0;
-}
-
-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    const int n_threads = cgraph->n_threads;
-
-    struct ggml_compute_state_shared state_shared = {
-        /*.spin      =*/ GGML_LOCK_INITIALIZER,
-        /*.n_threads =*/ n_threads,
-        /*.n_ready   =*/ 0,
-        /*.has_work  =*/ false,
-        /*.stop      =*/ false,
-    };
-    struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;
-
-    // create thread pool
-    if (n_threads > 1) {
-        ggml_lock_init(&state_shared.spin);
-
-        atomic_store(&state_shared.has_work, true);
-
-        for (int j = 0; j < n_threads - 1; j++) {
-            workers[j] = (struct ggml_compute_state) {
-                .thrd   = 0,
-                .params = {
-                    .type  = GGML_TASK_COMPUTE,
-                    .ith   = j + 1,
-                    .nth   = n_threads,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                },
-                .node   = NULL,
-                .shared = &state_shared,
-            };
-
-            int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-    }
-
-    // initialize tasks + work buffer
-    {
-        size_t work_size = 0;
-
-        // thread scheduling for the different operations
-        for (int i = 0; i < cgraph->n_nodes; i++) {
-            struct ggml_tensor * node = cgraph->nodes[i];
-
-            switch (node->op) {
-                case GGML_OP_CPY:
-                case GGML_OP_DUP:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-                        if (ggml_is_quantized(node->type)) {
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads;
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_ADD:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-
-                        if (ggml_is_quantized(node->src0->type)) {
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads;
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_SUB:
-                case GGML_OP_MUL:
-                case GGML_OP_DIV:
-                case GGML_OP_SQR:
-                case GGML_OP_SQRT:
-                case GGML_OP_SUM:
-                case GGML_OP_MEAN:
-                case GGML_OP_REPEAT:
-                case GGML_OP_ABS:
-                case GGML_OP_SGN:
-                case GGML_OP_NEG:
-                case GGML_OP_STEP:
-                case GGML_OP_RELU:
-                    {
-                        node->n_tasks = 1;
-                    } break;
-                case GGML_OP_GELU:
-                    {
-                        node->n_tasks = n_threads;
-                    } break;
-                case GGML_OP_SILU:
-                    {
-                        node->n_tasks = n_threads;
-                    } break;
-                case GGML_OP_NORM:
-                case GGML_OP_RMS_NORM:
-                    {
-                        node->n_tasks = n_threads;
-                    } break;
-                case GGML_OP_MUL_MAT:
-                    {
-                        node->n_tasks = n_threads;
-
-                        // TODO: use different scheduling for different matrix sizes
-                        //const int nr0 = ggml_nrows(node->src0);
-                        //const int nr1 = ggml_nrows(node->src1);
-
-                        //node->n_tasks = MIN(n_threads, MAX(1, nr0/128));
-                        //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks);
-
-                        size_t cur = 0;
-
-#if defined(GGML_USE_CUBLAS)
-                        if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
-                            node->n_tasks = 1; // TODO: this actually is doing nothing
-                                                //       the threads are still spinning
-                            cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
-                        }
-                        else
-#endif
-                        if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-                            if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                                node->n_tasks = 1; // TODO: this actually is doing nothing
-                                                   //       the threads are still spinning
-                                // here we need memory just for single 2D matrix from src0
-                                cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
-                            } else {
-                                cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
-                            }
-#else
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
-#endif
-                        } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
-                            cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-                            if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                                node->n_tasks = 1;
-                            }
-#endif
-                        } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
-                            if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
-                                node->n_tasks = 1;
-                                cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
-                            } else
-#endif
-                            {
-                                const enum ggml_type type_q = quantize_fns[node->src0->type].vec_dot_type;
-                                cur = GGML_TYPE_SIZE[type_q]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[type_q];
-                            }
-                        } else {
-                            GGML_ASSERT(false);
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_SCALE:
-                    {
-                        node->n_tasks = n_threads;
-                    } break;
-                case GGML_OP_CONT:
-                case GGML_OP_RESHAPE:
-                case GGML_OP_VIEW:
-                case GGML_OP_PERMUTE:
-                case GGML_OP_TRANSPOSE:
-                case GGML_OP_GET_ROWS:
-                case GGML_OP_DIAG_MASK_INF:
-                    {
-                        node->n_tasks = 1;
-                    } break;
-                case GGML_OP_SOFT_MAX:
-                    {
-                        node->n_tasks = n_threads;
-                    } break;
-                case GGML_OP_ROPE:
-                    {
-                        node->n_tasks = n_threads;
-                    } break;
-                case GGML_OP_ALIBI:
-                    {
-                        node->n_tasks = 1; //TODO
-                    } break;
-                case GGML_OP_CONV_1D_1S:
-                case GGML_OP_CONV_1D_2S:
-                    {
-                        node->n_tasks = n_threads;
-
-                        GGML_ASSERT(node->src0->ne[3] == 1);
-                        GGML_ASSERT(node->src1->ne[2] == 1);
-                        GGML_ASSERT(node->src1->ne[3] == 1);
-
-                        size_t cur = 0;
-                        const int nk = node->src0->ne[0];
-
-                        if (node->src0->type == GGML_TYPE_F16 &&
-                            node->src1->type == GGML_TYPE_F32) {
-                            cur = sizeof(ggml_fp16_t)*(
-                                    nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] +
-                                    ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1]
-                                    );
-                        } else if (node->src0->type == GGML_TYPE_F32 &&
-                                   node->src1->type == GGML_TYPE_F32) {
-                            cur = sizeof(float)*(
-                                    nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] +
-                                    ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1]
-                                    );
-                        } else {
-                            GGML_ASSERT(false);
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_FLASH_ATTN:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-
-                        const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);
-
-                        if (node->src1->type == GGML_TYPE_F32) {
-                            cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
-                        }
-
-                        if (node->src1->type == GGML_TYPE_F16) {
-                            cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_FLASH_FF:
-                    {
-                        node->n_tasks = n_threads;
-
-                        size_t cur = 0;
-
-                        if (node->src1->type == GGML_TYPE_F32) {
-                            cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
-                        }
-
-                        if (node->src1->type == GGML_TYPE_F16) {
-                            cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)
-                            cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2
-                        }
-
-                        work_size = MAX(work_size, cur);
-                    } break;
-                case GGML_OP_MAP_UNARY:
-                case GGML_OP_MAP_BINARY:
-                    {
-                        node->n_tasks = 1;
-                    } break;
-                case GGML_OP_NONE:
-                    {
-                        node->n_tasks = 1;
-                    } break;
-                case GGML_OP_COUNT:
-                    {
-                        GGML_ASSERT(false);
-                    } break;
-            }
-        }
-
-        if (cgraph->work != NULL && work_size > cgraph->work_size) {
-            GGML_ASSERT(false); // TODO: better handling
-        }
-
-        if (work_size > 0 && cgraph->work == NULL) {
-            cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1);
-
-            GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
-            cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
-        }
-    }
-
-    const int64_t perf_start_cycles  = ggml_perf_cycles();
-    const int64_t perf_start_time_us = ggml_perf_time_us();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes);
-
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        // TODO: this could be used to avoid unnecessary computations, but it needs to be improved
-        //if (node->grad == NULL && node->perf_runs > 0) {
-        //    continue;
-        //}
-
-        const int64_t perf_node_start_cycles  = ggml_perf_cycles();
-        const int64_t perf_node_start_time_us = ggml_perf_time_us();
-
-        // INIT
-        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_INIT,
-            /*.ith   =*/ 0,
-            /*.nth   =*/ node->n_tasks,
-            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
-        };
-
-        ggml_compute_forward(&params, node);
-
-        // COMPUTE
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            // launch thread pool
-            for (int j = 0; j < n_threads - 1; j++) {
-                workers[j].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_COMPUTE,
-                    .ith   = j + 1,
-                    .nth   = node->n_tasks,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                };
-                workers[j].node = node;
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_store(&state_shared.has_work, true);
-        }
-
-        params.type = GGML_TASK_COMPUTE;
-        ggml_compute_forward(&params, node);
-
-        // wait for thread pool
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-        }
-
-        // FINALIZE
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            // launch thread pool
-            for (int j = 0; j < n_threads - 1; j++) {
-                workers[j].params = (struct ggml_compute_params) {
-                    .type  = GGML_TASK_FINALIZE,
-                    .ith   = j + 1,
-                    .nth   = node->n_tasks,
-                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                    .wdata = cgraph->work ? cgraph->work->data : NULL,
-                };
-                workers[j].node = node;
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) > 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_store(&state_shared.has_work, true);
-        }
-
-        params.type = GGML_TASK_FINALIZE;
-        ggml_compute_forward(&params, node);
-
-        // wait for thread pool
-        if (node->n_tasks > 1) {
-            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
-                atomic_store(&state_shared.has_work, false);
-            }
-
-            while (atomic_load(&state_shared.has_work)) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-
-            atomic_fetch_sub(&state_shared.n_ready, 1);
-
-            while (atomic_load(&state_shared.n_ready) != 0) {
-                ggml_lock_lock  (&state_shared.spin);
-                ggml_lock_unlock(&state_shared.spin);
-            }
-        }
-
-        // performance stats (node)
-        {
-            int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_node_start_cycles;
-            int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;
-
-            node->perf_runs++;
-            node->perf_cycles  += perf_cycles_cur;
-            node->perf_time_us += perf_time_us_cur;
-        }
-    }
-
-    // join thread pool
-    if (n_threads > 1) {
-        atomic_store(&state_shared.stop, true);
-        atomic_store(&state_shared.has_work, true);
-
-        for (int j = 0; j < n_threads - 1; j++) {
-            int rc = ggml_thread_join(workers[j].thrd, NULL);
-            GGML_ASSERT(rc == 0);
-            UNUSED(rc);
-        }
-
-        ggml_lock_destroy(&state_shared.spin);
-    }
-
-    // performance stats (graph)
-    {
-        int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;
-        int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us;
-
-        cgraph->perf_runs++;
-        cgraph->perf_cycles  += perf_cycles_cur;
-        cgraph->perf_time_us += perf_time_us_cur;
-
-        GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n",
-                __func__, cgraph->perf_runs,
-                (double) perf_cycles_cur      / (double) ggml_cycles_per_ms(),
-                (double) cgraph->perf_cycles  / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs,
-                (double) perf_time_us_cur     / 1000.0,
-                (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);
-    }
-}
-
-void ggml_graph_reset(struct ggml_cgraph * cgraph) {
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * grad = cgraph->grads[i];
-
-        if (grad) {
-            ggml_set_zero(grad);
-        }
-    }
-}
-
-void ggml_graph_print(const struct ggml_cgraph * cgraph) {
-    int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};
-
-    GGML_PRINT("=== GRAPH ===\n");
-
-    GGML_PRINT_DEBUG("n_threads       = %d\n",        cgraph->n_threads);
-    GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
-
-    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * node = cgraph->nodes[i];
-
-        perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us);
-
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
-                i,
-                node->ne[0], node->ne[1], node->ne[2],
-                GGML_OP_LABEL[node->op], node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
-                (double) node->perf_cycles  / (double) ggml_cycles_per_ms(),
-                (double) node->perf_cycles  / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
-                (double) node->perf_time_us / 1000.0,
-                (double) node->perf_time_us / 1000.0 / node->perf_runs);
-    }
-
-    GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
-    for (int i = 0; i < cgraph->n_leafs; i++) {
-        struct ggml_tensor * node = cgraph->leafs[i];
-
-        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
-                i,
-                node->ne[0], node->ne[1],
-                GGML_OP_LABEL[node->op]);
-    }
-
-    for (int i = 0; i < GGML_OP_COUNT; i++) {
-        if (perf_total_per_op_us[i] == 0) {
-            continue;
-        }
-
-        GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);
-    }
-
-    GGML_PRINT("========================================\n");
-}
-
-// check if node is part of the graph
-static bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
-    if (cgraph == NULL) {
-        return true;
-    }
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        if (cgraph->nodes[i] == node) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
-static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        struct ggml_tensor * parent = cgraph->nodes[i];
-
-        if (parent->grad == node) {
-            return parent;
-        }
-    }
-
-    return NULL;
-}
-
-void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {
-    char color[16];
-
-    FILE * fp = fopen(filename, "w");
-    GGML_ASSERT(fp);
-
-    fprintf(fp, "digraph G {\n");
-    fprintf(fp, "  newrank = true;\n");
-    fprintf(fp, "  rankdir = LR;\n");
-
-    for (int i = 0; i < gb->n_nodes; i++) {
-        struct ggml_tensor * node = gb->nodes[i];
-
-        if (ggml_graph_get_parent(gb, node) != NULL) {
-            continue;
-        }
-
-        if (node->is_param) {
-            snprintf(color, sizeof(color), "yellow");
-        } else if (node->grad) {
-            if (ggml_graph_find(gf, node)) {
-                snprintf(color, sizeof(color), "green");
-            } else {
-                snprintf(color, sizeof(color), "lightblue");
-            }
-        } else {
-            snprintf(color, sizeof(color), "white");
-        }
-
-        fprintf(fp, "  \"%p\" [ "
-                    "style = filled; fillcolor = %s; shape = record; "
-                    "label=\"",
-                (void *) node, color);
-
-        if (strlen(node->name) > 0) {
-            fprintf(fp, "%s |", node->name);
-        }
-
-        fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
-                i, node->ne[0], node->ne[1],
-                GGML_OP_SYMBOL[node->op]);
-
-        if (node->grad) {
-            fprintf(fp, " | <g>%s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]);
-        } else {
-            fprintf(fp, "\"; ]\n");
-        }
-    }
-
-    for (int i = 0; i < gb->n_leafs; i++) {
-        struct ggml_tensor * node = gb->leafs[i];
-
-        snprintf(color, sizeof(color), "pink");
-
-        fprintf(fp, "  \"%p\" [ "
-                    "style = filled; fillcolor = %s; shape = record; "
-                    "label=\"<x>",
-                (void *) node, color);
-
-        if (strlen(node->name) > 0) {
-                fprintf(fp, "%s | ", node->name);
-        }
-        if (ggml_nelements(node) == 1) {
-            if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
-                fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
-            }
-            else {
-                fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
-            }
-        }
-        else {
-            fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
-        }
-        fprintf(fp, "\"; ]\n");
-    }
-
-    for (int i = 0; i < gb->n_nodes; i++) {
-        struct ggml_tensor * node = gb->nodes[i];
-
-        struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);
-
-        if (node->src0) {
-            struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);
-
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n",
-                    parent0 ? (void *) parent0 : (void *) node->src0,
-                    parent0 ? "g" : "x",
-                    parent ? (void *) parent : (void *) node,
-                    parent ? "g" : "x",
-                    parent ? "empty" : "vee",
-                    parent ? "dashed" : "solid");
-        }
-
-        if (node->src1) {
-            struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);
-
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n",
-                    parent1 ? (void *) parent1 : (void *) node->src1,
-                    parent1 ? "g" : "x",
-                    parent ? (void *) parent : (void *) node,
-                    parent ? "g" : "x",
-                    parent ? "empty" : "vee",
-                    parent ? "dashed" : "solid");
-        }
-    }
-
-    for (int i = 0; i < gb->n_leafs; i++) {
-        struct ggml_tensor * node = gb->leafs[i];
-
-        if (node->src0) {
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n",
-                    (void *) node->src0, "x",
-                    (void *) node, "x");
-        }
-
-        if (node->src1) {
-            fprintf(fp, "  \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n",
-                    (void *) node->src1, "x",
-                    (void *) node, "x");
-        }
-    }
-
-    fprintf(fp, "}\n");
-
-    fclose(fp);
-
-    GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-static void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {
-    int i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to set tensor from array
-        for (int64_t j = 0; j < ne; ++j) {
-            ggml_set_f32_1d(ps[p], j, x[i++]);
-        }
-    }
-}
-
-static void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {
-    int i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            x[i++] = ggml_get_f32_1d(ps[p], j);
-        }
-    }
-}
-
-static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {
-    int i = 0;
-    for (int p = 0; p < np; ++p) {
-        const int64_t ne = ggml_nelements(ps[p]) ;
-        // TODO: add function to get all elements at once
-        for (int64_t j = 0; j < ne; ++j) {
-            g[i++] = ggml_get_f32_1d(ps[p]->grad, j);
-        }
-    }
-}
-
-//
-// ADAM
-//
-//   ref: https://arxiv.org/pdf/1412.6980.pdf
-//
-
-static enum ggml_opt_result ggml_opt_adam(
-        struct ggml_context * ctx,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb) {
-    GGML_ASSERT(ggml_is_scalar(f));
-
-    gf->n_threads = params.n_threads;
-    gb->n_threads = params.n_threads;
-
-    // these will store the parameters we want to optimize
-    struct ggml_tensor * ps[GGML_MAX_PARAMS];
-
-    int np = 0;
-    int nx = 0;
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->is_param) {
-            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
-
-            GGML_ASSERT(np < GGML_MAX_PARAMS);
-
-            ps[np++] = gf->nodes[i];
-            nx += ggml_nelements(gf->nodes[i]);
-        }
-    }
-
-    // constants
-    const float alpha = params.adam.alpha;
-    const float beta1 = params.adam.beta1;
-    const float beta2 = params.adam.beta2;
-    const float eps   = params.adam.eps;
-
-    float * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // view of the parameters
-    float * g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient
-    float * g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient squared
-    float * m  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment
-    float * v  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment
-    float * mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment hat
-    float * vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment hat
-
-    float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values
-
-    // initialize
-    ggml_vec_set_f32(nx, m, 0.0f);
-    ggml_vec_set_f32(nx, v, 0.0f);
-
-    // update view
-    ggml_opt_get_params(np, ps, x);
-
-    // compute the function value
-    ggml_graph_reset  (gf);
-    ggml_set_f32      (f->grad, 1.0f);
-    ggml_graph_compute(ctx, gb);
-
-    float fx_prev = ggml_get_f32_1d(f, 0);
-    if (pf) {
-        pf[0] = fx_prev;
-    }
-
-    int n_no_improvement = 0;
-    float fx_best = fx_prev;
-
-    // run the optimizer
-    for (int t = 0; t < params.adam.n_iter; ++t) {
-        GGML_PRINT_DEBUG  ("=== iter %d ===\n", t);
-
-        GGML_PRINT_DEBUG  ("f      = %10.6f\n", ggml_get_f32_1d(f, 0));
-        GGML_PRINT_DEBUG_5("df/dx0 = %10.6f\n", ggml_get_f32_1d(ps[0]->grad, 0));
-        GGML_PRINT_DEBUG_5("df/dx1 = %10.6f\n", ggml_get_f32_1d(ps[1]->grad, 0));
-
-        for (int i = 0; i < np; ++i) {
-            GGML_PRINT_DEBUG("param %d: %10.6f, g = %10.6f\n", i,
-                    ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));
-        }
-
-        const int64_t t_start_wall = ggml_time_us();
-        const int64_t t_start_cpu = ggml_cycles();
-        UNUSED(t_start_wall);
-        UNUSED(t_start_cpu);
-
-        {
-            // update the gradient
-            ggml_opt_get_grad(np, ps, g1);
-
-            // m_t = beta1*m_t-1 + (1 - beta1)*g_t
-            ggml_vec_scale_f32(nx, m, beta1);
-            ggml_vec_mad_f32  (nx, m, g1, 1.0f - beta1);
-
-            // g2 = g1^2
-            ggml_vec_sqr_f32  (nx, g2, g1);
-
-            // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2
-            ggml_vec_scale_f32(nx, v, beta2);
-            ggml_vec_mad_f32  (nx, v, g2, 1.0f - beta2);
-
-            // m^hat = m_t / (1 - beta1^t)
-            // v^hat = v_t / (1 - beta2^t)
-            // x_t = x_t-1 - alpha*m^hat/(sqrt(v^hat) + eps)
-            ggml_vec_cpy_f32  (nx, mh, m);
-            ggml_vec_cpy_f32  (nx, vh, v);
-
-            ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, t + 1)));
-            ggml_vec_scale_f32(nx, vh,  1.0f/(1.0f - powf(beta2, t + 1)));
-
-            ggml_vec_sqrt_f32 (nx, vh, vh);
-            ggml_vec_acc1_f32 (nx, vh, eps);
-
-            ggml_vec_div_f32  (nx, mh, mh, vh);
-            ggml_vec_sub_f32  (nx, x,  x,  mh);
-
-            // update the parameters
-            ggml_opt_set_params(np, ps, x);
-        }
-
-        ggml_graph_reset  (gf);
-        ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(ctx, gb);
-
-        const float fx = ggml_get_f32_1d(f, 0);
-
-        // check convergence
-        if (fabsf(fx - fx_prev)/fx < params.adam.eps_f) {
-            GGML_PRINT_DEBUG("converged\n");
-
-            return GGML_OPT_OK;
-        }
-
-        // delta-based convergence test
-        if (pf != NULL) {
-            // need at least params.past iterations to start checking for convergence
-            if (params.past <= t) {
-                const float rate = (pf[t%params.past] - fx)/fx;
-
-                if (fabsf(rate) < params.delta) {
-                    return GGML_OPT_OK;
-                }
-            }
-
-            pf[t%params.past] = fx;
-        }
-
-        // check for improvement
-        if (params.max_no_improvement > 0) {
-            if (fx_best > fx) {
-                fx_best = fx;
-                n_no_improvement = 0;
-            } else {
-                ++n_no_improvement;
-
-                if (n_no_improvement >= params.max_no_improvement) {
-                    return GGML_OPT_OK;
-                }
-            }
-        }
-
-        fx_prev = fx;
-
-        {
-            const int64_t t_end_cpu = ggml_cycles();
-            GGML_PRINT_DEBUG("time iter:      %5.3f s\n", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC);
-            UNUSED(t_end_cpu);
-
-            const int64_t t_end_wall = ggml_time_us();
-            GGML_PRINT_DEBUG("wall time iter: %5.3f s\n", (t_end_wall - t_start_wall)/1e6);
-            UNUSED(t_end_wall);
-        }
-    }
-
-    return GGML_OPT_DID_NOT_CONVERGE;
-}
-
-//
-// L-BFGS
-//
-// the L-BFGS implementation below is based on the following implementation:
-//
-//   https://github.com/chokkan/liblbfgs
-//
-
-struct ggml_lbfgs_iteration_data {
-    float alpha;
-    float ys;
-    float * s;
-    float * y;
-};
-
-static enum ggml_opt_result linesearch_backtracking(
-        struct ggml_context * ctx,
-        const struct ggml_opt_params * params,
-        int nx,
-        float * x,
-        float * fx,
-        float * g,
-        float * d,
-        float * step,
-        const float * xp,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb,
-        const int np,
-        struct ggml_tensor * ps[]) {
-    int count = 0;
-
-    float width  = 0.0f;
-    float dg     = 0.0f;
-    float finit  = 0.0f;
-    float dginit = 0.0f;
-    float dgtest = 0.0f;
-
-    const float dec = 0.5f;
-    const float inc = 2.1f;
-
-    if (*step <= 0.f) {
-        return GGML_LINESEARCH_INVALID_PARAMETERS;
-    }
-
-    // compute the initial gradient in the search direction
-    ggml_vec_dot_f32(nx, &dginit, g, d);
-
-    // make sure that d points to a descent direction
-    if (0 < dginit) {
-        return GGML_LINESEARCH_FAIL;
-    }
-
-    // initialize local variables
-    finit = *fx;
-    dgtest = params->lbfgs.ftol*dginit;
-
-    while (true) {
-        ggml_vec_cpy_f32(nx, x, xp);
-        ggml_vec_mad_f32(nx, x, d, *step);
-
-        // evaluate the function and gradient values
-        {
-            ggml_opt_set_params(np, ps, x);
-
-            ggml_graph_reset  (gf);
-            ggml_set_f32      (f->grad, 1.0f);
-            ggml_graph_compute(ctx, gb);
-
-            ggml_opt_get_grad(np, ps, g);
-
-            *fx = ggml_get_f32_1d(f, 0);
-        }
-
-        ++count;
-
-        if (*fx > finit + (*step)*dgtest) {
-            width = dec;
-        } else {
-            // Armijo condition is satisfied
-            if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {
-                return count;
-            }
-
-            ggml_vec_dot_f32(nx, &dg, g, d);
-
-            // check the Wolfe condition
-            if (dg < params->lbfgs.wolfe * dginit) {
-                width = inc;
-            } else {
-                if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {
-                    // regular Wolfe conditions
-                    return count;
-                }
-
-                if(dg > -params->lbfgs.wolfe*dginit) {
-                    width = dec;
-                } else {
-                    // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
-                    return count;
-                }
-                return count;
-            }
-        }
-
-        if (*step < params->lbfgs.min_step) {
-            return GGML_LINESEARCH_MINIMUM_STEP;
-        }
-        if (*step > params->lbfgs.max_step) {
-            return GGML_LINESEARCH_MAXIMUM_STEP;
-        }
-        if (params->lbfgs.max_linesearch <= count) {
-            return GGML_LINESEARCH_MAXIMUM_ITERATIONS;
-        }
-
-        (*step) *= width;
-    }
-
-    return GGML_LINESEARCH_FAIL;
-}
-
-static enum ggml_opt_result ggml_opt_lbfgs(
-        struct ggml_context * ctx,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f,
-        struct ggml_cgraph * gf,
-        struct ggml_cgraph * gb) {
-    if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||
-        params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {
-        if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) {
-            return GGML_OPT_INVALID_WOLFE;
-        }
-    }
-
-    gf->n_threads = params.n_threads;
-    gb->n_threads = params.n_threads;
-
-    const int m = params.lbfgs.m;
-
-    // these will store the parameters we want to optimize
-    struct ggml_tensor * ps[GGML_MAX_PARAMS];
-
-    int np = 0;
-    int nx = 0;
-    for (int i = 0; i < gf->n_nodes; ++i) {
-        if (gf->nodes[i]->is_param) {
-            GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
-
-            GGML_ASSERT(np < GGML_MAX_PARAMS);
-
-            ps[np++] = gf->nodes[i];
-            nx += ggml_nelements(gf->nodes[i]);
-        }
-    }
-
-    float * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current parameters
-    float * xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous parameters
-    float * g  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current gradient
-    float * gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous gradient
-    float * d  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // search direction
-
-    float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values
-
-    float fx    = 0.0f; // cost function value
-    float xnorm = 0.0f; // ||x||
-    float gnorm = 0.0f; // ||g||
-    float step  = 0.0f;
-
-    // initialize x from the graph nodes
-    ggml_opt_get_params(np, ps, x);
-
-    // the L-BFGS memory
-    struct ggml_lbfgs_iteration_data * lm = alloca(sizeof(struct ggml_lbfgs_iteration_data)*m);
-
-    for (int i = 0; i < m; ++i) {
-        lm[i].alpha = 0.0f;
-        lm[i].ys    = 0.0f;
-        lm[i].s     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data;
-        lm[i].y     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data;
-    }
-
-    // evaluate the function value and its gradient
-    {
-        ggml_opt_set_params(np, ps, x);
-
-        ggml_graph_reset  (gf);
-        ggml_set_f32      (f->grad, 1.0f);
-        ggml_graph_compute(ctx, gb);
-
-        ggml_opt_get_grad(np, ps, g);
-
-        fx = ggml_get_f32_1d(f, 0);
-    }
-
-    if (pf) {
-        pf[0] = fx;
-    }
-
-    float fx_best = fx;
-
-    // search direction = -gradient
-    ggml_vec_neg_f32(nx, d, g);
-
-    // ||x||, ||g||
-    ggml_vec_norm_f32(nx, &xnorm, x);
-    ggml_vec_norm_f32(nx, &gnorm, g);
-
-    if (xnorm < 1.0f) {
-        xnorm = 1.0f;
-    }
-
-    // already optimized
-    if (gnorm/xnorm <= params.lbfgs.eps) {
-        return GGML_OPT_OK;
-    }
-
-    // initial step
-    ggml_vec_norm_inv_f32(nx, &step, d);
-
-    int j                = 0;
-    int k                = 1;
-    int ls               = 0;
-    int end              = 0;
-    int bound            = 0;
-    int n_no_improvement = 0;
-
-    float ys   = 0.0f;
-    float yy   = 0.0f;
-    float beta = 0.0f;
-
-    while (true) {
-        // store the current position and gradient vectors
-        ggml_vec_cpy_f32(nx, xp, x);
-        ggml_vec_cpy_f32(nx, gp, g);
-
-        ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, &step, xp, f, gf, gb, np, ps);
-
-        if (ls < 0) {
-            // linesearch failed - go back to the previous point and return
-            ggml_vec_cpy_f32(nx, x, xp);
-            ggml_vec_cpy_f32(nx, g, gp);
-
-            return ls;
-        }
-
-        ggml_vec_norm_f32(nx, &xnorm, x);
-        ggml_vec_norm_f32(nx, &gnorm, g);
-
-        GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0));
-
-        if (xnorm < 1.0f) {
-            xnorm = 1.0f;
-        }
-        if (gnorm/xnorm <= params.lbfgs.eps) {
-            // converged
-            return GGML_OPT_OK;
-        }
-
-        // delta-based convergence test
-        if (pf != NULL) {
-            // need at least params.past iterations to start checking for convergence
-            if (params.past <= k) {
-                const float rate = (pf[k%params.past] - fx)/fx;
-
-                if (fabsf(rate) < params.delta) {
-                    return GGML_OPT_OK;
-                }
-            }
-
-            pf[k%params.past] = fx;
-        }
-
-        // check for improvement
-        if (params.max_no_improvement > 0) {
-            if (fx < fx_best) {
-                fx_best = fx;
-                n_no_improvement = 0;
-            } else {
-                n_no_improvement++;
-
-                if (n_no_improvement >= params.max_no_improvement) {
-                    return GGML_OPT_OK;
-                }
-            }
-        }
-
-        if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < k + 1) {
-            // reached the maximum number of iterations
-            return GGML_OPT_DID_NOT_CONVERGE;
-        }
-
-        // update vectors s and y:
-        //   s_{k+1} = x_{k+1} - x_{k} = \step * d_{k}.
-        //   y_{k+1} = g_{k+1} - g_{k}.
-        //
-        ggml_vec_sub_f32(nx, lm[end].s, x, xp);
-        ggml_vec_sub_f32(nx, lm[end].y, g, gp);
-
-        // compute scalars ys and yy:
-        //     ys = y^t \cdot s    -> 1 / \rho.
-        //     yy = y^t \cdot y.
-        //
-        ggml_vec_dot_f32(nx, &ys, lm[end].y, lm[end].s);
-        ggml_vec_dot_f32(nx, &yy, lm[end].y, lm[end].y);
-
-        lm[end].ys = ys;
-
-        // find new search direction
-        //   ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS
-
-        bound = (m <= k) ? m : k;
-        k++;
-        end = (end + 1)%m;
-
-        // initialize search direction with -g
-        ggml_vec_neg_f32(nx, d, g);
-
-        j = end;
-        for (int i = 0; i < bound; ++i) {
-            j = (j + m - 1) % m;
-            // \alpha_{j} = \rho_{j} s^{t}_{j} \cdot q_{k+1}
-            ggml_vec_dot_f32(nx, &lm[j].alpha, lm[j].s, d);
-            lm[j].alpha /= lm[j].ys;
-            // q_{i} = q_{i+1} - \alpha_{i} y_{i}
-            ggml_vec_mad_f32(nx, d, lm[j].y, -lm[j].alpha);
-        }
-
-        ggml_vec_scale_f32(nx, d, ys/yy);
-
-        for (int i = 0; i < bound; ++i) {
-            // \beta_{j} = \rho_{j} y^t_{j} \cdot \gamma_{i}
-            ggml_vec_dot_f32(nx, &beta, lm[j].y, d);
-            beta /= lm[j].ys;
-            // \gamma_{i+1} = \gamma_{i} + (\alpha_{j} - \beta_{j}) s_{j}
-            ggml_vec_mad_f32(nx, d, lm[j].s, lm[j].alpha - beta);
-            j = (j + 1)%m;
-        }
-
-        step = 1.0;
-    }
-
-    return GGML_OPT_DID_NOT_CONVERGE;
-}
-
-struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
-    struct ggml_opt_params result;
-
-    switch (type) {
-        case GGML_OPT_ADAM:
-            {
-                result = (struct ggml_opt_params) {
-                    .type      = GGML_OPT_ADAM,
-                    .n_threads = 1,
-                    .past      = 0,
-                    .delta     = 1e-5f,
-
-                    .max_no_improvement = 100,
-
-                    .print_forward_graph  = true,
-                    .print_backward_graph = true,
-
-                    .adam = {
-                        .n_iter = 10000,
-                        .alpha  = 0.001f,
-                        .beta1  = 0.9f,
-                        .beta2  = 0.999f,
-                        .eps    = 1e-8f,
-                        .eps_f  = 1e-5f,
-                        .eps_g  = 1e-3f,
-                    },
-                };
-            } break;
-        case GGML_OPT_LBFGS:
-            {
-                result = (struct ggml_opt_params) {
-                    .type      = GGML_OPT_LBFGS,
-                    .n_threads = 1,
-                    .past      = 0,
-                    .delta     = 1e-5f,
-
-                    .max_no_improvement = 0,
-
-                    .print_forward_graph  = true,
-                    .print_backward_graph = true,
-
-                    .lbfgs = {
-                        .m              = 6,
-                        .n_iter         = 100,
-                        .max_linesearch = 20,
-
-                        .eps      = 1e-5f,
-                        .ftol     = 1e-4f,
-                        .wolfe    = 0.9f,
-                        .min_step = 1e-20f,
-                        .max_step = 1e+20f,
-
-                        .linesearch = GGML_LINESEARCH_DEFAULT,
-                    },
-                };
-            } break;
-    }
-
-    return result;
-}
-
-enum ggml_opt_result ggml_opt(
-        struct ggml_context * ctx,
-        struct ggml_opt_params params,
-        struct ggml_tensor * f) {
-    bool free_ctx = false;
-    if (ctx == NULL) {
-        struct ggml_init_params params_ctx = {
-            .mem_size   = 16*1024*1024,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
-        };
-
-        ctx = ggml_init(params_ctx);
-        if (ctx == NULL) {
-            return GGML_OPT_NO_CONTEXT;
-        }
-
-        free_ctx = true;
-    }
-
-    enum ggml_opt_result result = GGML_OPT_OK;
-
-    // build forward + backward compute graphs
-    struct ggml_cgraph *gf, *gb;
-    gf = (struct ggml_cgraph *)malloc(sizeof(struct ggml_cgraph));
-    gb = (struct ggml_cgraph *)malloc(sizeof(struct ggml_cgraph));
-
-    *gf = ggml_build_forward(f);
-    *gb = ggml_build_backward(ctx, gf, false);
-
-    switch (params.type) {
-        case GGML_OPT_ADAM:
-            {
-                result = ggml_opt_adam(ctx, params, f, gf, gb);
-            } break;
-        case GGML_OPT_LBFGS:
-            {
-                result = ggml_opt_lbfgs(ctx, params, f, gf, gb);
-            } break;
-    }
-
-    if (params.print_forward_graph) {
-        ggml_graph_print   (gf);
-        ggml_graph_dump_dot(gf, NULL, "opt-forward.dot");
-    }
-
-    if (params.print_backward_graph) {
-        ggml_graph_print   (gb);
-        ggml_graph_dump_dot(gb, gf, "opt-backward.dot");
-    }
-
-    free(gb);
-    free(gf);
-
-    if (free_ctx) {
-        ggml_free(ctx);
-    }
-
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK4_0 == 0);
-    const int nb = k / QK4_0;
-
-    for (int j = 0; j < n; j += k) {
-        block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK4_0;
-
-        quantize_row_q4_0_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK4_0; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK4_0*sizeof(block_q4_0));
-}
-
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK4_1 == 0);
-    const int nb = k / QK4_1;
-
-    for (int j = 0; j < n; j += k) {
-        block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK4_1;
-
-        quantize_row_q4_1_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK4_1; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK4_1*sizeof(block_q4_1));
-}
-
-size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK4_2 == 0);
-    const int nb = k / QK4_2;
-
-    for (int j = 0; j < n; j += k) {
-        block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2;
-
-        quantize_row_q4_2_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK4_2; l += 2) {
-                const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
-                const uint8_t vi1 = y[i].qs[l/2] >> 4;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK4_2*sizeof(block_q4_2));
-}
-
-size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK5_0 == 0);
-    const int nb = k / QK5_0;
-
-    for (int j = 0; j < n; j += k) {
-        block_q5_0 * restrict y = (block_q5_0 *)dst + j/QK5_0;
-
-        quantize_row_q5_0_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
-
-            for (int l = 0; l < QK5_0; l += 2) {
-                const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
-                const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
-
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[l/2] >>   4) | vh1) / 2;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK5_0*sizeof(block_q5_0));
-}
-
-size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK5_1 == 0);
-    const int nb = k / QK5_1;
-
-    for (int j = 0; j < n; j += k) {
-        block_q5_1 * restrict y = (block_q5_1 *)dst + j/QK5_1;
-
-        quantize_row_q5_1_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            uint32_t qh;
-            memcpy(&qh, &y[i].qh, sizeof(qh));
-
-            for (int l = 0; l < QK5_1; l += 2) {
-                const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
-                const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
-
-                // cast to 16 bins
-                const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
-                const uint8_t vi1 = ((y[i].qs[l/2] >>   4) | vh1) / 2;
-
-                hist[vi0]++;
-                hist[vi1]++;
-            }
-        }
-    }
-
-    return (n/QK5_1*sizeof(block_q5_1));
-}
-
-size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
-
-    for (int j = 0; j < n; j += k) {
-        block_q8_0 * restrict y = (block_q8_0 *)dst + j/QK8_0;
-
-        quantize_row_q8_0_reference(src + j, y, k);
-
-        for (int i = 0; i < nb; i++) {
-            for (int l = 0; l < QK8_0; ++l) {
-                const int8_t vi = y[i].qs[l];
-
-                hist[vi/16 + 8]++;
-            }
-        }
-    }
-
-    return (n/QK8_0*sizeof(block_q8_0));
-}
-
-size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
-    size_t result = 0;
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            {
-                GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q4_1:
-            {
-                GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q4_2:
-            {
-                GGML_ASSERT(start % QK4_2 == 0);
-                block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
-                result = ggml_quantize_q4_2(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q5_0:
-            {
-                GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q5_1:
-            {
-                GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
-            } break;
-        case GGML_TYPE_Q8_0:
-            {
-                GGML_ASSERT(start % QK8_0 == 0);
-                block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
-                result = ggml_quantize_q8_0(src + start, block, n, n, hist);
-            } break;
-        default:
-            assert(false);
-    }
-    return result;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-int ggml_cpu_has_avx(void) {
-#if defined(__AVX__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx2(void) {
-#if defined(__AVX2__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512(void) {
-#if defined(__AVX512F__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512_vbmi(void) {
-#if defined(__AVX512VBMI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_avx512_vnni(void) {
-#if defined(__AVX512VNNI__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_fma(void) {
-#if defined(__FMA__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_neon(void) {
-#if defined(__ARM_NEON)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_arm_fma(void) {
-#if defined(__ARM_FEATURE_FMA)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_f16c(void) {
-#if defined(__F16C__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_fp16_va(void) {
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_wasm_simd(void) {
-#if defined(__wasm_simd128__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_cublas(void) {
-#if defined(GGML_USE_CUBLAS)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_clblast(void) {
-#if defined(GGML_USE_CLBLAST)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
-}
-
-int ggml_cpu_has_sse3(void) {
-#if defined(__SSE3__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-int ggml_cpu_has_vsx(void) {
-#if defined(__POWER9_VECTOR__)
-    return 1;
-#else
-    return 0;
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////
diff --git a/third_party/radpajama/ggml.h b/third_party/radpajama/ggml.h
deleted file mode 100644
index 20b661383..000000000
--- a/third_party/radpajama/ggml.h
+++ /dev/null
@@ -1,931 +0,0 @@
-#ifndef COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
-#define COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
-#if !(__ASSEMBLER__ + __LINKER__ + 0)
-COSMOPOLITAN_C_START_
-// clang-format off
-
-//
-// GGML Tensor Library
-//
-// This documentation is still a work in progress.
-// If you wish some specific topics to be covered, feel free to drop a comment:
-//
-//   https://github.com/ggerganov/whisper.cpp/issues/40
-//
-// ## Overview
-//
-// This library implements:
-//
-//  - a set of tensor operations
-//  - automatic differentiation
-//  - basic optimization algorithms
-//
-// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
-// but is not limited to, the following:
-//
-//  - linear regression
-//  - support vector machines
-//  - neural networks
-//
-// The library allows the user to define a certain function using the available tensor operations. This function
-// definition is represented internally via a computation graph. Each tensor operation in the function definition
-// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
-// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
-// using one of the available optimization algorithms.
-//
-// For example, here we define the function: f(x) = a*x^2 + b
-//
-//   {
-//       struct ggml_init_params params = {
-//           .mem_size   = 16*1024*1024,
-//           .mem_buffer = NULL,
-//       };
-//
-//       // memory allocation happens here
-//       struct ggml_context * ctx = ggml_init(params);
-//
-//       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-//
-//       ggml_set_param(ctx, x); // x is an input variable
-//
-//       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-//       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-//       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
-//       struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
-//
-//       ...
-//   }
-//
-// Notice that the function definition above does not involve any actual computation. The computation is performed only
-// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
-//
-//   {
-//       ...
-//
-//       struct ggml_cgraph gf = ggml_build_forward(f);
-//
-//       // set the input variable and parameter values
-//       ggml_set_f32(x, 2.0f);
-//       ggml_set_f32(a, 3.0f);
-//       ggml_set_f32(b, 4.0f);
-//
-//       ggml_graph_compute(ctx0, &gf);
-//
-//       printf("f = %f\n", ggml_get_f32_1d(f, 0));
-//
-//       ...
-//   }
-//
-// The actual computation is performed in the ggml_graph_compute() function.
-//
-// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
-// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
-// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
-// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
-// actually needed.
-//
-// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
-// differentiation and optimization algorithms.
-//
-// The described approach allows to define the function graph once and then compute its forward or backward graphs
-// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
-// the user can avoid the memory allocation overhead at runtime.
-//
-// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
-// citizens, but in theory the library can be extended to support FP8 and integer data types.
-//
-// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
-// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
-// clear that the library needs to support more complex operations. The way to support these operations is not clear
-// yet, but a few examples are demonstrated in the following operations:
-//
-//   - ggml_permute()
-//   - ggml_conv_1d_1s()
-//   - ggml_conv_1d_2s()
-//
-// For each tensor operator, the library implements a forward and backward computation function. The forward function
-// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
-// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
-// calculus class, or watch the following video:
-//
-//   What is Automatic Differentiation?
-//   https://www.youtube.com/watch?v=wG_nF1awSSY
-//
-//
-// ## Tensor data (struct ggml_tensor)
-//
-// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
-// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
-// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
-//
-//   {
-//       struct ggml_tensor * c = ggml_add(ctx, a, b);
-//
-//       assert(c->src[0] == a);
-//       assert(c->src[1] == b);
-//   }
-//
-// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
-// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
-// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
-// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
-// contiguous in memory.
-//
-// The data of the tensor is accessed via the "data" pointer. For example:
-//
-//   {
-//       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
-//
-//       // a[1, 2] = 1.0f;
-//       *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
-//
-//       // a[2, 0] = 2.0f;
-//       *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
-//
-//       ...
-//   }
-//
-// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
-//
-// ## The matrix multiplication operator (ggml_mul_mat)
-//
-// TODO
-//
-//
-// ## Multi-threading
-//
-// TODO
-//
-//
-// ## Overview of ggml.c
-//
-// TODO
-//
-//
-// ## SIMD optimizations
-//
-// TODO
-//
-//
-// ## Debugging ggml
-//
-// TODO
-//
-//
-
-#ifdef GGML_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef GGML_BUILD
-#            define GGML_API __declspec(dllexport)
-#        else
-#            define GGML_API __declspec(dllimport)
-#        endif
-#    else
-#        define GGML_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define GGML_API
-#endif
-
-#define GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define GGML_FILE_VERSION 1
-
-#define GGML_MAX_DIMS          4
-#define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        16
-#define GGML_MAX_CONTEXTS      64
-#define GGML_MAX_OPT           4
-#define GGML_DEFAULT_N_THREADS 4
-
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-#ifdef __ARM_NEON
-    // we use the built-in 16-bit float type
-    typedef __fp16 ggml_fp16_t;
-#else
-    typedef uint16_t ggml_fp16_t;
-#endif
-
-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
-
-    struct ggml_object;
-    struct ggml_context;
-
-    enum ggml_type {
-        GGML_TYPE_F32  = 0,
-        GGML_TYPE_F16  = 1,
-        GGML_TYPE_Q4_0 = 2,
-        GGML_TYPE_Q4_1 = 3,
-        GGML_TYPE_Q4_2 = 4,
-        // GGML_TYPE_Q4_3 (5) support has been removed
-        GGML_TYPE_Q5_0 = 6,
-        GGML_TYPE_Q5_1 = 7,
-        GGML_TYPE_Q8_0 = 8,
-        GGML_TYPE_Q8_1 = 9,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
-        GGML_TYPE_COUNT,
-    };
-
-    // model file types
-    enum ggml_ftype {
-        GGML_FTYPE_UNKNOWN     = -1,
-        GGML_FTYPE_ALL_F32     = 0,
-        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
-    };
-
-    // available tensor operations:
-    enum ggml_op {
-        GGML_OP_NONE = 0,
-
-        GGML_OP_DUP,
-        GGML_OP_ADD,
-        GGML_OP_SUB,
-        GGML_OP_MUL,
-        GGML_OP_DIV,
-        GGML_OP_SQR,
-        GGML_OP_SQRT,
-        GGML_OP_SUM,
-        GGML_OP_MEAN,
-        GGML_OP_REPEAT,
-        GGML_OP_ABS,
-        GGML_OP_SGN,
-        GGML_OP_NEG,
-        GGML_OP_STEP,
-        GGML_OP_RELU,
-        GGML_OP_GELU,
-        GGML_OP_SILU,
-        GGML_OP_NORM, // normalize
-        GGML_OP_RMS_NORM,
-
-        GGML_OP_MUL_MAT,
-
-        GGML_OP_SCALE,
-        GGML_OP_CPY,
-        GGML_OP_CONT,
-        GGML_OP_RESHAPE,
-        GGML_OP_VIEW,
-        GGML_OP_PERMUTE,
-        GGML_OP_TRANSPOSE,
-        GGML_OP_GET_ROWS,
-        GGML_OP_DIAG_MASK_INF,
-        GGML_OP_SOFT_MAX,
-        GGML_OP_ROPE,
-        GGML_OP_ALIBI,
-        GGML_OP_CONV_1D_1S,
-        GGML_OP_CONV_1D_2S,
-
-        GGML_OP_FLASH_ATTN,
-        GGML_OP_FLASH_FF,
-
-        GGML_OP_MAP_UNARY,
-        GGML_OP_MAP_BINARY,
-
-        GGML_OP_COUNT,
-    };
-
-
-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        char padding[8];
-    };
-
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
-    // n-dimensional tensor
-    struct ggml_tensor {
-        enum ggml_type type;
-
-        int     n_dims;
-        int64_t ne[GGML_MAX_DIMS]; // number of elements
-        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                                   // nb[0] = sizeof(type)
-                                   // nb[1] = nb[0]   * ne[0] + padding
-                                   // nb[i] = nb[i-1] * ne[i-1]
-
-        // compute data
-        enum ggml_op op;
-
-        bool is_param;
-
-        struct ggml_tensor * grad;
-        struct ggml_tensor * src0;
-        struct ggml_tensor * src1;
-        struct ggml_tensor * opt[GGML_MAX_OPT];
-
-        // thread scheduling
-        int n_tasks;
-
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-
-        void * data;
-
-        char name[32];
-
-        char padding[8]; // TODO: remove and add padding to name?
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int n_nodes;
-        int n_leafs;
-        int n_threads;
-
-        size_t work_size;
-        struct ggml_tensor * work;
-
-        struct ggml_tensor * nodes[GGML_MAX_NODES];
-        struct ggml_tensor * grads[GGML_MAX_NODES];
-        struct ggml_tensor * leafs[GGML_MAX_NODES];
-
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-    };
-
-    // scratch buffer
-    struct ggml_scratch {
-        size_t offs;
-        size_t size;
-        void * data;
-    };
-
-    struct ggml_init_params {
-        // memory pool
-        size_t mem_size;   // bytes
-        void * mem_buffer; // if NULL, memory will be allocated internally
-        bool   no_alloc;   // don't allocate memory for the tensor data
-    };
-
-    // misc
-
-    GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
-    GGML_API int64_t ggml_time_ms(void);
-    GGML_API int64_t ggml_time_us(void);
-    GGML_API int64_t ggml_cycles(void);
-    GGML_API int64_t ggml_cycles_per_ms(void);
-
-    GGML_API void    ggml_print_object (const struct ggml_object * obj);
-    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
-
-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
-
-    GGML_API int     ggml_blck_size (enum ggml_type type);
-    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
-
-    GGML_API const char * ggml_type_name(enum ggml_type type);
-
-    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
-
-    GGML_API bool    ggml_is_quantized(enum ggml_type type);
-
-    // TODO: temporary until model loading of ggml examples is refactored
-    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
-
-    // main
-
-    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void    ggml_free(struct ggml_context * ctx);
-
-    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
-
-    GGML_API size_t  ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int    n_dims,
-            const int64_t *ne);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_1d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_2d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0,
-            int64_t ne1);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_3d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0,
-            int64_t ne1,
-            int64_t ne2);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_4d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0,
-            int64_t ne1,
-            int64_t ne2,
-            int64_t ne3);
-
-    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
-
-    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
-    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
-
-    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
-    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
-    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
-    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
-    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
-
-    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
-    GGML_API void         ggml_set_name(struct ggml_tensor * tensor, const char * name);
-
-    //
-    // operations on tensors with backpropagation
-    //
-
-    GGML_API struct ggml_tensor * ggml_dup(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_add(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_add_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_sub(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_mul(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_div(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_sqr(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sqrt(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // return scalar
-    // TODO: compute sum along rows
-    GGML_API struct ggml_tensor * ggml_sum(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // mean along rows
-    GGML_API struct ggml_tensor * ggml_mean(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // if a is the same shape as b, and a is not parameter, return a
-    // otherwise, return a new tensor: repeat(a) to fit in b
-    GGML_API struct ggml_tensor * ggml_repeat(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_abs(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sgn(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_neg(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_step(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_relu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // TODO: double-check this computation is correct
-    GGML_API struct ggml_tensor * ggml_gelu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_silu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // normalize along rows
-    // TODO: eps is hardcoded to 1e-5 for now
-    GGML_API struct ggml_tensor * ggml_norm(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_rms_norm(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // A: m rows, n columns
-    // B: p rows, n columns (i.e. we transpose it internally)
-    // result is m columns, p rows
-    GGML_API struct ggml_tensor * ggml_mul_mat(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    //
-    // operations on tensors without backpropagation
-    //
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_scale(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // a -> b, return view(b)
-    GGML_API struct ggml_tensor * ggml_cpy(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // make contiguous
-    GGML_API struct ggml_tensor * ggml_cont(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // return view(a), b specifies the new shape
-    // TODO: when we start computing gradient, make a copy instead of view
-    GGML_API struct ggml_tensor * ggml_reshape(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // return view(a)
-    // TODO: when we start computing gradient, make a copy instead of view
-    GGML_API struct ggml_tensor * ggml_reshape_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1);
-
-    // return view(a)
-    // TODO: when we start computing gradient, make a copy instead of view
-    GGML_API struct ggml_tensor * ggml_reshape_3d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2);
-
-    // offset in bytes
-    GGML_API struct ggml_tensor * ggml_view_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_view_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            size_t                nb1, // row stride in bytes
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_view_3d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2,
-            size_t                nb1, // row   stride in bytes
-            size_t                nb2, // slice stride in bytes
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_permute(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   axis0,
-            int                   axis1,
-            int                   axis2,
-            int                   axis3);
-
-    // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
-    GGML_API struct ggml_tensor * ggml_transpose(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_get_rows(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // set elements above the diagonal to -INF
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_diag_mask_inf(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_soft_max(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // rotary position embedding
-    // in-place, returns view(a)
-    // if mode & 1 == 1, skip n_past elements
-    // if mode & 2 == 1, GPT-NeoX style
-    // TODO: avoid creating a new tensor every time
-    GGML_API struct ggml_tensor * ggml_rope(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past,
-            int                   n_dims,
-            int                   mode);
-
-    // alibi position embedding
-    // in-place, returns view(a)
-    struct ggml_tensor * ggml_alibi(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past,
-            int                   n_head);
-
-    // padding = 1
-    // TODO: we don't support extra parameters for now
-    //       that's why we are hard-coding the stride, padding, and dilation
-    //       not great ..
-    GGML_API struct ggml_tensor * ggml_conv_1d_1s(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_conv_1d_2s(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_flash_attn(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * q,
-            struct ggml_tensor  * k,
-            struct ggml_tensor  * v,
-            bool                  masked);
-
-    GGML_API struct ggml_tensor * ggml_flash_ff(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b0,
-            struct ggml_tensor  * b1,
-            struct ggml_tensor  * c0,
-            struct ggml_tensor  * c1);
-
-    // Mapping operations
-    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    GGML_API struct ggml_tensor * ggml_map_unary_f32(
-            struct ggml_context        * ctx,
-            struct ggml_tensor         * a,
-            const  ggml_unary_op_f32_t fun);
-
-    GGML_API struct ggml_tensor * ggml_map_binary_f32(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-            const  ggml_binary_op_f32_t fun);
-
-    //
-    // automatic differentiation
-    //
-
-    GGML_API void ggml_set_param(
-            struct ggml_context * ctx,
-            struct ggml_tensor * tensor);
-
-    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-
-    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
-    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
-
-    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
-
-    // print info and performance information for the graph
-    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
-
-    // dump the graph into a file using the dot format
-    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
-
-    //
-    // optimization
-    //
-
-    // optimization methods
-    enum ggml_opt_type {
-        GGML_OPT_ADAM,
-        GGML_OPT_LBFGS,
-    };
-
-    // linesearch methods
-    enum ggml_linesearch {
-        GGML_LINESEARCH_DEFAULT = 1,
-
-        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
-        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
-        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
-    };
-
-    // optimization return values
-    enum ggml_opt_result {
-        GGML_OPT_OK = 0,
-        GGML_OPT_DID_NOT_CONVERGE,
-        GGML_OPT_NO_CONTEXT,
-        GGML_OPT_INVALID_WOLFE,
-        GGML_OPT_FAIL,
-
-        GGML_LINESEARCH_FAIL = -128,
-        GGML_LINESEARCH_MINIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
-        GGML_LINESEARCH_INVALID_PARAMETERS,
-    };
-
-    // optimization parameters
-    //
-    //   see ggml.c (ggml_opt_default_params) for default values
-    //
-    struct ggml_opt_params {
-        enum ggml_opt_type type;
-
-        int n_threads;
-
-        // delta-based convergence test
-        //
-        //   if past == 0 - disabled
-        //   if past > 0:
-        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
-        //
-        int past;
-        float delta;
-
-        // maximum number of iterations without improvement
-        //
-        //   if 0 - disabled
-        //   if > 0:
-        //     assume convergence if no cost improvement in this number of iterations
-        //
-        int max_no_improvement;
-
-        bool print_forward_graph;
-        bool print_backward_graph;
-
-        // ADAM parameters
-        struct {
-            int n_iter;
-
-            float alpha; // learning rate
-            float beta1;
-            float beta2;
-            float eps;   // epsilon for numerical stability
-            float eps_f; // epsilon for convergence test
-            float eps_g; // epsilon for convergence test
-        } adam;
-
-        // LBFGS parameters
-        struct {
-            int m; // number of corrections to approximate the inv. Hessian
-            int n_iter;
-            int max_linesearch;
-
-            float eps;      // convergence tolerance
-            float ftol;     // line search tolerance
-            float wolfe;
-            float min_step;
-            float max_step;
-
-            enum ggml_linesearch linesearch;
-        } lbfgs;
-    };
-
-    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
-
-    // optimize the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt(
-            struct ggml_context * ctx,
-            struct ggml_opt_params params,
-            struct ggml_tensor * f);
-
-    //
-    // quantization
-    //
-
-    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
-
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
-
-    //
-    // system info
-    //
-
-    GGML_API int ggml_cpu_has_avx        (void);
-    GGML_API int ggml_cpu_has_avx2       (void);
-    GGML_API int ggml_cpu_has_avx512     (void);
-    GGML_API int ggml_cpu_has_avx512_vbmi(void);
-    GGML_API int ggml_cpu_has_avx512_vnni(void);
-    GGML_API int ggml_cpu_has_fma        (void);
-    GGML_API int ggml_cpu_has_neon       (void);
-    GGML_API int ggml_cpu_has_arm_fma    (void);
-    GGML_API int ggml_cpu_has_f16c       (void);
-    GGML_API int ggml_cpu_has_fp16_va    (void);
-    GGML_API int ggml_cpu_has_wasm_simd  (void);
-    GGML_API int ggml_cpu_has_blas       (void);
-    GGML_API int ggml_cpu_has_cublas     (void);
-    GGML_API int ggml_cpu_has_clblast    (void);
-    GGML_API int ggml_cpu_has_gpublas    (void);
-    GGML_API int ggml_cpu_has_sse3       (void);
-    GGML_API int ggml_cpu_has_vsx        (void);
-
-    //
-    // Internal types and functions exposed for tests and benchmarks
-    //
-
-#ifdef  __cplusplus
-    // restrict not standard in C++
-#define GGML_RESTRICT
-#else
-#define GGML_RESTRICT restrict
-#endif
-    typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-    typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
-
-    typedef struct {
-        dequantize_row_q_t dequantize_row_q;
-        quantize_row_q_t   quantize_row_q;
-        quantize_row_q_t   quantize_row_q_reference;
-        quantize_row_q_t   quantize_row_q_dot;
-        vec_dot_q_t        vec_dot_q;
-        enum ggml_type     vec_dot_type;
-    } quantize_fns_t;
-
-    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
-
-COSMOPOLITAN_C_END_
-#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
-#endif /* COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_ */
diff --git a/third_party/radpajama/ggml.mk b/third_party/radpajama/ggml.mk
deleted file mode 100644
index 81c0a98c4..000000000
--- a/third_party/radpajama/ggml.mk
+++ /dev/null
@@ -1,141 +0,0 @@
-#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
-#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘
-
-PKGS += THIRD_PARTY_GGML
-
-################################################################################
-# single file machine learning framework written in c
-# make -j8 o//third_party/ggml/ggml.a
-
-THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_A
-THIRD_PARTY_GGML = $(THIRD_PARTY_GGML_A_DEPS) $(THIRD_PARTY_GGML_A)
-THIRD_PARTY_GGML_A = o/$(MODE)/third_party/ggml/ggml.a
-THIRD_PARTY_GGML_A_HDRS = third_party/ggml/ggml.h
-THIRD_PARTY_GGML_A_SRCS = third_party/ggml/ggml.c
-THIRD_PARTY_GGML_A_OBJS = $(THIRD_PARTY_GGML_A_SRCS:%.c=o/$(MODE)/%.o)
-THIRD_PARTY_GGML_A_FILES = $(THIRD_PARTY_GGML_A_SRCS) $(THIRD_PARTY_GGML_A_HDRS)
-THIRD_PARTY_GGML_A_CHECKS = $(THIRD_PARTY_GGML_A).pkg $(THIRD_PARTY_GGML_A_HDRS:%=o/$(MODE)/%.ok)
-
-THIRD_PARTY_GGML_A_DIRECTDEPS =						\
-	LIBC_CALLS							\
-	LIBC_INTRIN							\
-	LIBC_MEM							\
-	LIBC_NEXGEN32E							\
-	LIBC_RUNTIME							\
-	LIBC_STDIO							\
-	LIBC_THREAD							\
-	LIBC_STR							\
-	LIBC_STUBS							\
-	LIBC_SYSV							\
-	LIBC_TINYMATH							\
-	THIRD_PARTY_COMPILER_RT
-
-THIRD_PARTY_GGML_A_DEPS :=						\
-	$(call uniq,$(foreach x,$(THIRD_PARTY_GGML_A_DIRECTDEPS),$($(x))))
-
-$(THIRD_PARTY_GGML_A):							\
-		third_party/ggml/					\
-		$(THIRD_PARTY_GGML_A).pkg				\
-		$(THIRD_PARTY_GGML_A_OBJS)
-
-$(THIRD_PARTY_GGML_A).pkg:						\
-		$(THIRD_PARTY_GGML_A_OBJS)				\
-		$(foreach x,$(THIRD_PARTY_GGML_A_DIRECTDEPS),$($(x)_A).pkg)
-
-$(THIRD_PARTY_GGML_A_OBJS): private					\
-		OVERRIDE_CFLAGS +=					\
-			-O3						\
-			-ffunction-sections				\
-			-fdata-sections
-
-ifeq ($(ARCH), x86_64)
-$(THIRD_PARTY_GGML_A_OBJS): private					\
-		OVERRIDE_CFLAGS +=					\
-			-msse3						\
-			-mavx						\
-			-mavx2						\
-			-mf16c						\
-			-mfma
-endif
-
-o/rel/third_party/ggml/ggml.o						\
-o/opt/third_party/ggml/ggml.o: private					\
-		OVERRIDE_CFLAGS +=					\
-			-fomit-frame-pointer				\
-			-x-no-pg
-
-ifeq ($(ARCH), x86_64)
-o/rel/third_party/ggml/ggml.o						\
-o/opt/third_party/ggml/ggml.o: private					\
-		OVERRIDE_CFLAGS +=					\
-			-fschedule-insns2				\
-			-mred-zone
-endif
-
-################################################################################
-# command for running inference on large language models
-# make -j8 o//third_party/ggml/llama.com
-
-THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_LLAMA
-THIRD_PARTY_GGML_LLAMA = o/$(MODE)/third_party/ggml/llama.com
-THIRD_PARTY_GGML_LLAMA_HDRS = third_party/ggml/llama.h third_party/ggml/llama_util.h third_party/ggml/common.h
-THIRD_PARTY_GGML_LLAMA_SRCS = third_party/ggml/main.cc third_party/ggml/llama.cc third_party/ggml/common.cc
-THIRD_PARTY_GGML_LLAMA_OBJS = $(THIRD_PARTY_GGML_LLAMA_SRCS:%.cc=o/$(MODE)/%.o)
-THIRD_PARTY_GGML_LLAMA_FILES := $(THIRD_PARTY_GGML_LLAMA_SRCS) $(THIRD_PARTY_GGML_LLAMA_HDRS)
-THIRD_PARTY_GGML_LLAMA_CHECKS = $(THIRD_PARTY_GGML_LLAMA).pkg $(THIRD_PARTY_GGML_LLAMA_HDRS:%=o/$(MODE)/%.okk)
-
-THIRD_PARTY_GGML_LLAMA_DIRECTDEPS =					\
-	LIBC_CALLS							\
-	LIBC_FMT							\
-	LIBC_INTRIN							\
-	LIBC_MEM							\
-	LIBC_NEXGEN32E							\
-	LIBC_RUNTIME							\
-	LIBC_STDIO							\
-	LIBC_LOG							\
-	LIBC_STR							\
-	LIBC_STUBS							\
-	LIBC_SYSV							\
-	LIBC_THREAD							\
-	LIBC_TINYMATH							\
-	LIBC_ZIPOS							\
-	THIRD_PARTY_GGML						\
-	THIRD_PARTY_LIBCXX
-
-THIRD_PARTY_GGML_LLAMA_DEPS :=						\
-	$(call uniq,$(foreach x,$(THIRD_PARTY_GGML_LLAMA_DIRECTDEPS),$($(x))))
-
-$(THIRD_PARTY_GGML_LLAMA).dbg:						\
-		$(THIRD_PARTY_GGML_LLAMA).pkg				\
-		$(THIRD_PARTY_GGML_LLAMA_DEPS)				\
-		o/$(MODE)/third_party/ggml/companionai.txt.zip.o	\
-		o/$(MODE)/third_party/ggml/common.o			\
-		o/$(MODE)/third_party/ggml/llama.o			\
-		o/$(MODE)/third_party/ggml/main.o			\
-		$(CRT)							\
-		$(APE_NO_MODIFY_SELF)
-	@$(APELINK)
-
-$(THIRD_PARTY_GGML_LLAMA).pkg:						\
-		$(THIRD_PARTY_GGML_LLAMA_OBJS)				\
-		$(foreach x,$(THIRD_PARTY_GGML_LLAMA_DIRECTDEPS),$($(x)_A).pkg)
-
-o/$(MODE)/third_party/ggml/companionai.txt.zip.o: private		\
-		ZIPOBJ_FLAGS +=						\
-			-B
-
-################################################################################
-
-THIRD_PARTY_GGML_COMS = $(THIRD_PARTY_GGML_LLAMA)
-THIRD_PARTY_GGML_BINS = $(THIRD_PARTY_GGML_COMS) $(THIRD_PARTY_GGML_COMS:%=%.dbg)
-THIRD_PARTY_GGML_LIBS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)))
-THIRD_PARTY_GGML_SRCS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_SRCS))
-THIRD_PARTY_GGML_HDRS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_HDRS))
-THIRD_PARTY_GGML_OBJS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_OBJS))
-THIRD_PARTY_GGML_CHECKS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_CHECKS))
-$(THIRD_PARTY_GGML_OBJS): third_party/ggml/ggml.mk
-
-.PHONY: o/$(MODE)/third_party/ggml
-o/$(MODE)/third_party/ggml:						\
-		$(THIRD_PARTY_GGML_BINS)				\
-		$(THIRD_PARTY_GGML_CHECKS)
diff --git a/third_party/radpajama/gptneox-util.h b/third_party/radpajama/gptneox-util.h
index 52d7bf80a..d372d37d9 100644
--- a/third_party/radpajama/gptneox-util.h
+++ b/third_party/radpajama/gptneox-util.h
@@ -1,41 +1,48 @@
-// Internal header to be included only by llama.cpp.
-// Contains wrappers around OS interfaces.
-
 #ifndef GPTNEOX_UTIL_H
 #define GPTNEOX_UTIL_H
+#include "libc/calls/calls.h"
+#include "libc/calls/struct/rlimit.h"
+#include "libc/calls/struct/rusage.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/errno.h"
+#include "libc/runtime/pathconf.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/sysconf.h"
+#include "libc/str/str.h"
+#include "libc/sysv/consts/f.h"
+#include "libc/sysv/consts/fileno.h"
+#include "libc/sysv/consts/madv.h"
+#include "libc/sysv/consts/map.h"
+#include "libc/sysv/consts/mfd.h"
+#include "libc/sysv/consts/mlock.h"
+#include "libc/sysv/consts/mremap.h"
+#include "libc/sysv/consts/msync.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/ok.h"
+#include "libc/sysv/consts/posix.h"
+#include "libc/sysv/consts/prio.h"
+#include "libc/sysv/consts/prot.h"
+#include "libc/sysv/consts/rlim.h"
+#include "libc/sysv/consts/rlimit.h"
+#include "libc/sysv/consts/rusage.h"
+#include "libc/time/time.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/ggml/llama_util.h"
+#include "third_party/libcxx/cerrno"
+#include "third_party/libcxx/climits"
+#include "third_party/libcxx/cstdarg"
+#include "third_party/libcxx/cstdint"
+#include "third_party/libcxx/cstdio"
+#include "third_party/libcxx/cstdlib"
+#include "third_party/libcxx/cstring"
+#include "third_party/libcxx/string"
+#include "third_party/libcxx/vector"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/lockf.h"
+// clang-format off
 
-#include <cstdio>
-#include <cstdint>
-#include <cerrno>
-#include <cstring>
-#include <cstdarg>
-#include <cstdlib>
-#include <climits>
-
-#include <string>
-#include <vector>
-
-#ifdef __has_include
-    #if __has_include(<unistd.h>)
-        #include <unistd.h>
-        #if defined(_POSIX_MAPPED_FILES)
-            #include <sys/mman.h>
-        #endif
-        #if defined(_POSIX_MEMLOCK_RANGE)
-            #include <sys/resource.h>
-        #endif
-    #endif
-#endif
-
-#if defined(_WIN32)
-    #define WIN32_LEAN_AND_MEAN
-    #ifndef NOMINMAX
-        #define NOMINMAX
-    #endif
-    #include <windows.h>
-    #include <io.h>
-    #include <stdio.h> // for _fseeki64
-#endif
+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
 
 #define GPTNEOX_ASSERT(x) \
     do { \
@@ -74,7 +81,7 @@ struct gptneox_file {
     gptneox_file(const char * fname, const char * mode) {
         fp = std::fopen(fname, mode);
         if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, std::strerror(errno));
+            Die("failed to open %s: %s", fname, std::strerror(errno));
         }
         seek(0, SEEK_END);
         size = tell();
@@ -107,10 +114,10 @@ struct gptneox_file {
         errno = 0;
         std::size_t ret = std::fread(ptr, size, 1, fp);
         if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            Die("read error: %s", strerror(errno));
         }
         if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            Die("unexpectedly reached end of file");
         }
     }
 
@@ -133,7 +140,7 @@ struct gptneox_file {
         errno = 0;
         size_t ret = std::fwrite(ptr, size, 1, fp);
         if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            Die("write error: %s", strerror(errno));
         }
     }
 
@@ -180,7 +187,7 @@ struct gptneox_mmap {
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
         if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            Die("mmap failed: %s", strerror(errno));
         }
 
         if (prefetch) {
@@ -207,7 +214,7 @@ struct gptneox_mmap {
         DWORD error = GetLastError();
 
         if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", gptneox_format_win_err(error).c_str());
+            Die("CreateFileMappingA failed: %s", gptneox_format_win_err(error).c_str());
         }
 
         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +222,7 @@ struct gptneox_mmap {
         CloseHandle(hMapping);
 
         if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", gptneox_format_win_err(error).c_str());
+            Die("MapViewOfFile failed: %s", gptneox_format_win_err(error).c_str());
         }
 
         #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -244,7 +251,7 @@ struct gptneox_mmap {
     static constexpr bool SUPPORTED = false;
 
     gptneox_mmap(struct gptneox_file *) {
-        throw std::string("mmap not supported");
+        Die("mmap not supported");
     }
 #endif
 };
@@ -407,7 +414,7 @@ struct gptneox_buffer {
 };
 
 #ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
+// MISSING #include "ggml-cuda.h"
 struct gptneox_ctx_buffer {
     uint8_t * addr = NULL;
     size_t size = 0;
diff --git a/third_party/radpajama/gptneox.cpp b/third_party/radpajama/gptneox.cc
similarity index 94%
rename from third_party/radpajama/gptneox.cpp
rename to third_party/radpajama/gptneox.cc
index f7fcfc292..06d8bf352 100644
--- a/third_party/radpajama/gptneox.cpp
+++ b/third_party/radpajama/gptneox.cc
@@ -1,33 +1,59 @@
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
+╚──────────────────────────────────────────────────────────────────────────────╝
+│                                                                              │
+│  radpajama.com                                                               │
+│  Copyright (c) 2023 Ariel Núñez                                              │
+│  Copyright (c) 2023 Georgi Gerganov                                          │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files (the             │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and/or sell copies of the Software, and to          │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│  The above copyright notice and this permission notice shall be              │
+│  included in all copies or substantial portions of the Software.             │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
+│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
+│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
+│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
+│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
+│                                                                              │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "third_party/radpajama/gptneox.h"
+#include "libc/intrin/bits.h"
+#include "third_party/ggml/fp16.h"
+#include "third_party/ggml/ggml.h"
+#include "third_party/ggml/llama_util.h"
+#include "third_party/libcxx/algorithm"
+#include "third_party/libcxx/array"
+#include "third_party/libcxx/atomic"
+#include "third_party/libcxx/cassert"
+#include "third_party/libcxx/cinttypes"
+#include "third_party/libcxx/climits"
+#include "third_party/libcxx/cstdint"
+#include "third_party/libcxx/cstdio"
+#include "third_party/libcxx/cstring"
+#include "third_party/libcxx/ctime"
+#include "third_party/libcxx/fstream"
+#include "third_party/libcxx/initializer_list"
+#include "third_party/libcxx/map"
+#include "third_party/libcxx/memory"
+#include "third_party/libcxx/mutex"
+#include "third_party/libcxx/queue"
+#include "third_party/libcxx/random"
+#include "third_party/libcxx/sstream"
+#include "third_party/libcxx/thread"
+#include "third_party/libcxx/unordered_map"
+#include "third_party/radpajama/gptneox-util.h"
+// clang-format off
 // Defines fileno on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#include <cstdint>
-#include <cstdio>
-#endif
-
-#include "gptneox-util.h"
-#include "gptneox.h"
-
-#include "../ggml.h"
-
-#include <array>
-#include <ctime>
-#include <cinttypes>
-#include <fstream>
-#include <random>
-#include <map>
-#include <unordered_map>
-#include <queue>
-#include <cassert>
-#include <cstring>
-#include <climits>
-#include <memory>
-#include <algorithm>
-#include <initializer_list>
-#include <thread>
-#include <atomic>
-#include <mutex>
-#include <sstream>
 
 // TODO: Add back in n_ctx (max_position_embeddings) to ggml model, it is currently hard-coded to 2048 max for llama
 
@@ -289,7 +315,7 @@ template <typename T>
 static T checked_mul(T a, T b) {
     T ret = a * b;
     if (a != 0 && ret / a != b) {
-        throw format("overflow multiplying %llu * %llu",
+        Die("overflow multiplying %llu * %llu",
                      (unsigned long long) a, (unsigned long long) b);
     }
     return ret;
@@ -297,7 +323,7 @@ static T checked_mul(T a, T b) {
 
 static size_t checked_div(size_t a, size_t b) {
     if (b == 0 || a % b != 0) {
-        throw format("error dividing %zu / %zu", a, b);
+        Die("error dividing %zu / %zu", a, b);
     }
     return a / b;
 }
@@ -361,7 +387,7 @@ struct gptneox_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.type != first_shard.type) {
-                throw format("inconsistent tensor shard type in '%s'", name.c_str());
+                Die("inconsistent tensor shard type in '%s'", name.c_str());
             }
         }
         type = first_shard.type;
@@ -384,7 +410,7 @@ struct gptneox_load_tensor {
         const auto & first_shard = shards.at(0);
         for (const auto & shard : shards) {
             if (shard.ne != first_shard.ne) {
-                throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
+                Die("inconsistent tensor shard shape in '%s': first was %s, other was %s",
                              name.c_str(), gptneox_format_tensor_shape(first_shard.ne).c_str(), gptneox_format_tensor_shape(shard.ne).c_str());
             }
         }
@@ -441,18 +467,18 @@ struct gptneox_file_loader {
         uint32_t magic = file.read_u32();
         uint32_t version = 0;
 
-        if (magic != 'ggml') {
+        if (magic != READ32BE("ggml")) {
             version = file.read_u32();
         }
 
-        if (magic == 'ggml' && version == 0) {
+        if (magic == READ32BE("ggml") && version == 0) {
             file_version = GPTNEOX_FILE_VERSION_GGML;
-        } else if (magic == 'ggmf' && version == 1) {
+        } else if (magic == READ32BE("ggmf") && version == 1) {
             file_version = GPTNEOX_FILE_VERSION_GGMF_V1;
-        } else if (magic == 'ggjt' && version == 1) {
+        } else if (magic == READ32BE("ggjt") && version == 1) {
             file_version = GPTNEOX_FILE_VERSION_GGJT_V1;
         } else {
-            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+            Die("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
         }
     }
@@ -496,7 +522,7 @@ struct gptneox_file_loader {
             file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
             std::string name = file.read_string(name_len);
             if (n_dims < 1 || n_dims > 2) {
-                throw format("gptneox.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
+                Die("gptneox.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
             }
             switch (shard.type) {
                 case GGML_TYPE_F32:
@@ -509,7 +535,7 @@ struct gptneox_file_loader {
                 case GGML_TYPE_Q8_0:
                     break;
                 default: {
-                    throw format("unrecognized tensor type %u\n", shard.type);
+                    Die("unrecognized tensor type %u\n", shard.type);
                 }
             }
 
@@ -543,12 +569,13 @@ struct gptneox_file_saver {
     gptneox_file_saver(const char * fname, gptneox_file_loader * any_file_loader, enum gptneox_ftype new_ftype)
         : file(fname, "wb"), any_file_loader(any_file_loader) {
         fprintf(stderr, "gptneox.cpp: saving model to %s\n", fname);
+        ggjt_v1();
         write_magic();
         write_hparams(new_ftype);
         write_vocab();
     }
     void write_magic() {
-        file.write_u32('ggjt'); // magic
+        file.write_u32(READ32BE("ggjt")); // magic
         file.write_u32(1); // version
     }
     void write_hparams(enum gptneox_ftype new_ftype) {
@@ -616,7 +643,7 @@ struct gptneox_model_loader {
             auto ith_file = new gptneox_file_loader(fname.c_str(), i, tensors_map);
             file_loaders.emplace_back(ith_file);
             if (ith_file->hparams != first_file->hparams) {
-                throw format("gptneox.cpp: hparams inconsistent between files");
+                Die("gptneox.cpp: hparams inconsistent between files");
             }
         }
         if (!gptneox_mmap::SUPPORTED) {
@@ -646,7 +673,7 @@ struct gptneox_model_loader {
     uint32_t guess_n_parts() const {
         auto it = tensors_map.name_to_idx.find("gpt_neox.embed_in.weight");
         if (it == tensors_map.name_to_idx.end()) {
-            throw std::string("missing gpt_neox.embed_in.weight");
+            Die("missing gpt_neox.embed_in.weight");
         }
         const gptneox_load_tensor & lt = tensors_map.tensors.at(it->second);
         return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
@@ -663,11 +690,11 @@ struct gptneox_model_loader {
     struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
-            throw format("gptneox.cpp: tensor '%s' is missing from model", name.c_str());
+            Die("gptneox.cpp: tensor '%s' is missing from model", name.c_str());
         }
         gptneox_load_tensor & lt = tensors_map.tensors.at(it->second);
         if (lt.ne != ne) {
-            throw format("gptneox.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+            Die("gptneox.cpp: tensor '%s' has wrong shape; expected %s, got %s",
                          name.c_str(), gptneox_format_tensor_shape(ne).c_str(), gptneox_format_tensor_shape(lt.ne).c_str());
         }
 
@@ -690,7 +717,7 @@ struct gptneox_model_loader {
 
     void done_getting_tensors() {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-            throw std::string("gptneox.cpp: file contained more tensors than expected");
+            Die("gptneox.cpp: file contained more tensors than expected");
         }
     }
 
@@ -1003,7 +1030,7 @@ static void gptneox_model_load_internal(
 
         model.ctx = ggml_init(params);
         if (!model.ctx) {
-            throw format("ggml_init() failed");
+            Die("ggml_init() failed");
         }
     }
 
@@ -1072,14 +1099,14 @@ static bool gptneox_model_load(
         bool vocab_only,
         gptneox_progress_callback progress_callback,
         void *progress_callback_user_data) {
-    try {
+    // try {
         gptneox_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
                                   vocab_only, progress_callback, progress_callback_user_data);
         return true;
-    } catch (const std::string & err) {
-        fprintf(stderr, "error loading model: %s\n", err.c_str());
-        return false;
-    }
+    // } catch (const std::string & err) {
+    //     fprintf(stderr, "error loading model: %s\n", err.c_str());
+    //     return false;
+    // }
 }
 
 // evaluate the transformer
@@ -2053,13 +2080,13 @@ int gptneox_model_copy(
         const char * fname_inp,
         const char * fname_out,
   enum gptneox_ftype   ftype) {
-    try {
+    // try {
         gptneox_model_copy_internal(fname_inp, fname_out, ftype);
         return 0;
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to copy: %s\n", __func__, err.c_str());
-        return 1;
-    }
+    // } catch (const std::string & err) {
+    //     fprintf(stderr, "%s: failed to copy: %s\n", __func__, err.c_str());
+    //     return 1;
+    // }
 }
 
 
@@ -2072,7 +2099,7 @@ static void gptneox_model_quantize_internal(const std::string & fname_inp, const
         case GPTNEOX_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case GPTNEOX_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case GPTNEOX_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        default: throw format("invalid output file type %d\n", ftype);
+        default: Die("invalid output file type %d\n", ftype);
     };
 
     if (nthread <= 0) {
@@ -2138,7 +2165,7 @@ static void gptneox_model_quantize_internal(const std::string & fname_inp, const
                     f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
                 }
             } else {
-                throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
+                Die("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
             }
 
             printf("quantizing .. ");
@@ -2302,13 +2329,13 @@ int gptneox_model_quantize(
         const char * fname_out,
   enum gptneox_ftype   ftype,
         int          nthread) {
-    try {
+    // try {
         gptneox_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
         return 0;
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
-        return 1;
-    }
+    // } catch (const std::string & err) {
+    //     fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
+    //     return 1;
+    // }
 }
 
 int gptneox_apply_lora_from_file_internal(struct gptneox_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
@@ -2328,7 +2355,7 @@ int gptneox_apply_lora_from_file_internal(struct gptneox_context * ctx, const ch
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        if (magic != READ32BE("ggla")) {
             fprintf(stderr, "%s: bad file magic\n", __func__);
             return 1;
         }
@@ -2551,12 +2578,12 @@ int gptneox_apply_lora_from_file_internal(struct gptneox_context * ctx, const ch
 }
 
 int gptneox_apply_lora_from_file(struct gptneox_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
-    try {
+    // try {
         return gptneox_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
-        return 1;
-    }
+    // } catch (const std::string & err) {
+    //     fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
+    //     return 1;
+    // }
 }
 
 int gptneox_get_kv_cache_token_count(struct gptneox_context * ctx) {
@@ -2892,7 +2919,7 @@ size_t gptneox_load_session_file(struct gptneox_context * ctx, const char * path
     const uint32_t magic = file.read_u32();
     const uint32_t version = file.read_u32();
 
-    if (!(magic == 'ggsn' && version == 0)) {
+    if (!(magic == READ32BE("ggsn") && version == 0)) {
         fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
         return 0;
     }
@@ -2929,7 +2956,7 @@ size_t gptneox_save_session_file(struct gptneox_context * ctx, const char * path
     std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
     gptneox_copy_state_data(ctx, state_data.get());
 
-    file.write_u32('ggsn'); // magic
+    file.write_u32(READ32BE("ggsn")); // magic
     file.write_u32(0); // version
     file.write_raw(&ctx->model.hparams, sizeof(gptneox_hparams));
 
diff --git a/third_party/radpajama/gptneox.h b/third_party/radpajama/gptneox.h
index 1b1cfea25..4c0e83994 100644
--- a/third_party/radpajama/gptneox.h
+++ b/third_party/radpajama/gptneox.h
@@ -1,9 +1,6 @@
 #ifndef GPTNEOX_H
 #define GPTNEOX_H
-
-#include <stddef.h>
-#include <stdint.h>
-#include <stdbool.h>
+// clang-format off
 
 #ifdef GPTNEOX_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
@@ -264,8 +261,8 @@ extern "C" {
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef GPTNEOX_API_INTERNAL
 
-#include <vector>
-#include <string>
+#include "third_party/libcxx/vector"
+#include "third_party/libcxx/string"
 struct ggml_tensor;
 
 std::vector<std::pair<std::string, struct ggml_tensor *>>& gptneox_internal_get_tensor_map(struct gptneox_context * ctx);
diff --git a/third_party/radpajama/llama.h b/third_party/radpajama/llama.h
deleted file mode 100644
index 6b3c143d5..000000000
--- a/third_party/radpajama/llama.h
+++ /dev/null
@@ -1,256 +0,0 @@
-// -*- c++ -*-
-#ifndef LLAMA_H
-#define LLAMA_H
-#include "libc/intrin/bits.h"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/vector"
-// clang-format off
-
-#ifdef LLAMA_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef LLAMA_BUILD
-#            define LLAMA_API __declspec(dllexport)
-#        else
-#            define LLAMA_API __declspec(dllimport)
-#        endif
-#    else
-#        define LLAMA_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define LLAMA_API
-#endif
-
-#define LLAMA_FILE_VERSION           1
-#define LLAMA_FILE_MAGIC             READ32BE("ggjt")
-#define LLAMA_FILE_MAGIC_UNVERSIONED READ32BE("ggml")
-#define LLAMA_SESSION_MAGIC          READ32BE("ggsn")
-#define LLAMA_SESSION_VERSION        1
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    //
-    // C interface
-    //
-    // TODO: show sample usage
-    //
-
-    struct llama_context;
-
-    typedef int llama_token;
-
-    typedef struct llama_token_data {
-        llama_token id;  // token id
-        float logit; // log-odds of the token
-        float p;     // probability of the token
-    } llama_token_data;
-
-    typedef struct llama_token_data_array {
-        llama_token_data * data;
-        size_t size;
-        bool sorted;
-    } llama_token_data_array;
-
-    typedef void (*llama_progress_callback)(float progress, void *ctx);
-
-    struct llama_context_params {
-        int n_ctx;   // text context
-        int n_parts; // -1 for default
-        int seed;    // RNG seed, -1 for random
-
-        bool f16_kv;     // use fp16 for KV cache
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool vocab_only; // only load the vocabulary, no weights
-        bool use_mmap;   // use mmap if possible
-        bool use_mlock;  // force system to keep model in RAM
-        bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
-    };
-
-    // model file types
-    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32     = 0,
-        LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
-    };
-
-    LLAMA_API struct llama_context_params llama_context_default_params();
-
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
-
-    // Various functions for loading a ggml llama model.
-    // Allocate (almost) all memory needed for the model.
-    // Return NULL on failure
-    LLAMA_API struct llama_context * llama_init_from_file(
-                             const char * path_model,
-            struct llama_context_params   params,
-                                    int   verbose);
-
-    // Frees all allocated memory
-    LLAMA_API void llama_free(struct llama_context * ctx);
-
-    // TODO: not great API - very likely to change
-    // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
-    LLAMA_API int llama_model_quantize(
-            const char * fname_inp,
-            const char * fname_out,
-      enum llama_ftype   ftype,
-            int          nthread);
-
-    // Apply a LoRA adapter to a loaded model
-    // path_base_model is the path to a higher quality model to use as a base for
-    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-    // will be applied on top of the previous one
-    // Returns 0 on success
-    LLAMA_API int llama_apply_lora_from_file(
-            struct llama_context * ctx,
-                      const char * path_lora,
-                      const char * path_base_model,
-                             int   n_threads);
-
-    // Returns the number of tokens in the KV cache
-    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
-
-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
-
-    // Returns the maximum size in bytes of the state (rng, logits, embedding
-    // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
-
-    // Copies the state to the specified destination address.
-    // Destination needs to have allocated enough memory.
-    // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
-
-    // Set the state reading from the specified address
-    // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
-
-    // Run the llama inference to obtain the logits and probabilities for the next token.
-    // tokens + n_tokens is the provided batch of new tokens to process
-    // n_past is the number of tokens to use from previous eval calls
-    // Returns 0 on success
-    LLAMA_API int llama_eval(
-            struct llama_context * ctx,
-               const llama_token * tokens,
-                             int   n_tokens,
-                             int   n_past,
-                             int   n_threads);
-
-    // Convert the provided text into tokens.
-    // The tokens pointer must be large enough to hold the resulting tokens.
-    // Returns the number of tokens on success, no more than n_max_tokens
-    // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
-    LLAMA_API int llama_tokenize(
-            struct llama_context * ctx,
-                      const char * text,
-                     llama_token * tokens,
-                             int   n_max_tokens,
-                            bool   add_bos);
-
-    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
-    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
-
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos();
-    LLAMA_API llama_token llama_token_eos();
-    LLAMA_API llama_token llama_token_nl();
-
-    // Sampling functions
-
-    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
-
-    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
-
-    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
-
-    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
-
-    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
-
-    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
-
-    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
-    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
-
-    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
-    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
-
-    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
-    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
-    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
-    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
-
-    /// @details Selects the token with the highest probability.
-    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
-
-    /// @details Randomly selects a token from the candidates based on their probabilities.
-    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
-
-    // Performance information
-    LLAMA_API void llama_print_timings(struct llama_context * ctx);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
-
-    // Print system information
-    LLAMA_API const char * llama_print_system_info(void);
-
-#ifdef __cplusplus
-}
-#endif
-
-// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
-#ifdef LLAMA_API_INTERNAL
-
-struct ggml_tensor;
-
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
-
-#endif
-
-#endif // LLAMA_H
diff --git a/third_party/radpajama/llama_util.h b/third_party/radpajama/llama_util.h
deleted file mode 100755
index 05184945d..000000000
--- a/third_party/radpajama/llama_util.h
+++ /dev/null
@@ -1,389 +0,0 @@
-// Internal header to be included only by llama.cpp.
-// Contains wrappers around OS interfaces.
-
-#ifndef LLAMA_UTIL_H
-#define LLAMA_UTIL_H
-#include "libc/calls/struct/rlimit.h"
-#include "libc/dce.h"
-#include "libc/fmt/fmt.h"
-#include "libc/runtime/sysconf.h"
-#include "libc/sysv/consts/madv.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/sysv/consts/rlimit.h"
-#include "third_party/libcxx/cerrno"
-#include "third_party/libcxx/climits"
-#include "third_party/libcxx/cstdarg"
-#include "third_party/libcxx/cstdint"
-#include "third_party/libcxx/cstdio"
-#include "third_party/libcxx/cstdlib"
-#include "third_party/libcxx/cstring"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/vector"
-// clang-format off
-
-#define LLAMA_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((__format__(__gnu_printf__, 1, 2)))
-#else
-__attribute__((__format__(__printf__, 1, 2)))
-#endif
-__attribute__((__noreturn__))
-#endif
-static void Die(const char *fmt, ...) {
-    va_list va;
-    va_start(va, fmt);
-    vfprintf(stderr, fmt, va);
-    va_end(va);
-    fputc('\n', stderr);
-    exit(1);
-}
-
-struct llama_file {
-    // use FILE * so we don't have to re-open the file to mmap
-    FILE * fp;
-    size_t size;
-
-    llama_file(const char * fname, const char * mode) {
-        fp = std::fopen(fname, mode);
-        if (fp == NULL) {
-            Die("failed to open %s: %s", fname, std::strerror(errno));
-        }
-        seek(0, SEEK_END);
-        size = tell();
-        seek(0, SEEK_SET);
-    }
-
-    size_t tell() const {
-#ifdef _WIN32
-        __int64 ret = _ftelli64(fp);
-#else
-        long ret = std::ftell(fp);
-#endif
-        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
-        return (size_t) ret;
-    }
-
-    void seek(size_t offset, int whence) {
-#ifdef _WIN32
-        int ret = _fseeki64(fp, (__int64) offset, whence);
-#else
-        int ret = std::fseek(fp, (long) offset, whence);
-#endif
-        LLAMA_ASSERT(ret == 0); // same
-    }
-
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
-        if (ferror(fp)) {
-            Die("read error: %s", strerror(errno));
-        }
-        if (ret != 1) {
-            Die("unexpectedly reached end of file");
-        }
-    }
-
-    std::uint32_t read_u32() {
-        std::uint32_t ret;
-        read_raw(&ret, sizeof(ret));
-        return ret;
-    }
-
-    std::string read_string(std::uint32_t len) {
-        std::vector<char> chars(len);
-        read_raw(chars.data(), len);
-        return std::string(chars.data(), len);
-    }
-
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
-            return;
-        }
-        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
-        if (ret != 1) {
-            Die("write error: %s", strerror(errno));
-        }
-    }
-
-    void write_u32(std::uint32_t val) {
-        write_raw(&val, sizeof(val));
-    }
-
-    ~llama_file() {
-        if (fp) {
-            std::fclose(fp);
-        }
-    }
-};
-
-#if defined(_WIN32)
-static std::string llama_format_win_err(DWORD err) {
-    LPSTR buf;
-    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
-                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
-    if (!size) {
-        return "FormatMessageA failed";
-    }
-    std::string ret(buf, size);
-    LocalFree(buf);
-    return ret;
-}
-#endif
-
-struct llama_mmap {
-    void * addr;
-    size_t size;
-
-    llama_mmap(const llama_mmap &) = delete;
-
-#ifdef _POSIX_MAPPED_FILES
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
-        size = file->size;
-        int fd = fileno(file->fp);
-        int flags = MAP_SHARED;
-#ifdef __linux__
-        flags |= MAP_POPULATE;
-#endif
-        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        if (addr == MAP_FAILED) {
-            Die("mmap failed: %s", strerror(errno));
-        }
-
-        if (prefetch && !IsWindows()) {
-            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                        strerror(errno));
-            }
-        }
-    }
-
-    ~llama_mmap() {
-        munmap(addr, size);
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
-        size = file->size;
-
-        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
-
-        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
-        DWORD error = GetLastError();
-
-        if (hMapping == NULL) {
-            Die("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
-        }
-
-        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
-        error = GetLastError();
-        CloseHandle(hMapping);
-
-        if (addr == NULL) {
-            Die("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
-        }
-
-        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
-        if (prefetch) {
-            // Advise the kernel to preload the mapped memory
-            WIN32_MEMORY_RANGE_ENTRY range;
-            range.VirtualAddress = addr;
-            range.NumberOfBytes = (SIZE_T)size;
-            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-            }
-        }
-        #else
-        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
-        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
-    }
-
-    ~llama_mmap() {
-        if (!UnmapViewOfFile(addr)) {
-            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    llama_mmap(struct llama_file *) {
-        Die("mmap not supported");
-    }
-#endif
-};
-
-// Represents some region of memory being locked using mlock or VirtualLock;
-// will automatically unlock on destruction.
-struct llama_mlock {
-    void * addr = NULL;
-    size_t size = 0;
-    bool failed_already = false;
-
-    llama_mlock() {}
-    llama_mlock(const llama_mlock &) = delete;
-
-    ~llama_mlock() {
-        if (size) {
-            raw_unlock(addr, size);
-        }
-    }
-
-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
-    }
-
-    void grow_to(size_t target_size) {
-        LLAMA_ASSERT(addr);
-        if (failed_already) {
-            return;
-        }
-        size_t granularity = lock_granularity();
-        target_size = (target_size + granularity - 1) & ~(granularity - 1);
-        if (target_size > size) {
-            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
-                size = target_size;
-            } else {
-                failed_already = true;
-            }
-        }
-    }
-
-#ifdef _POSIX_MEMLOCK_RANGE
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        return (size_t) sysconf(_SC_PAGESIZE);
-    }
-
-    #ifdef __APPLE__
-        #define MLOCK_SUGGESTION \
-            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-    #else
-        #define MLOCK_SUGGESTION \
-            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-    #endif
-
-    bool raw_lock(const void * addr, size_t size) {
-        if (!mlock(addr, size)) {
-            return true;
-        } else {
-            char* errmsg = std::strerror(errno);
-            bool suggest = (errno == ENOMEM);
-
-            // Check if the resource limit is fine after all
-            struct rlimit lock_limit;
-            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
-                suggest = false;
-            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
-                suggest = false;
-
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
-                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
-            return false;
-        }
-    }
-
-    #undef MLOCK_SUGGESTION
-
-    void raw_unlock(void * addr, size_t size) {
-        if (munlock(addr, size)) {
-            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
-        }
-    }
-#elif defined(_WIN32)
-    static constexpr bool SUPPORTED = true;
-
-    size_t lock_granularity() {
-        SYSTEM_INFO si;
-        GetSystemInfo(&si);
-        return (size_t) si.dwPageSize;
-    }
-
-    bool raw_lock(void * addr, size_t size) {
-        for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
-                return true;
-            }
-            if (tries == 2) {
-                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-
-            // It failed but this was only the first try; increase the working
-            // set size and try again.
-            SIZE_T min_ws_size, max_ws_size;
-            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-            // Per MSDN: "The maximum number of pages that a process can lock
-            // is equal to the number of pages in its minimum working set minus
-            // a small overhead."
-            // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
-            // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += increment;
-            max_ws_size += increment;
-            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
-                        llama_format_win_err(GetLastError()).c_str());
-                return false;
-            }
-        }
-    }
-
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
-            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
-        }
-    }
-#else
-    static constexpr bool SUPPORTED = false;
-
-    void raw_lock(const void * addr, size_t size) {
-        fprintf(stderr, "warning: mlock not supported on this system\n");
-    }
-
-    void raw_unlock(const void * addr, size_t size) {}
-#endif
-};
-
-// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
-struct llama_buffer {
-    uint8_t * addr = NULL;
-    size_t size = 0;
-
-    void resize(size_t size) {
-        free(addr);
-        addr = (uint8_t *)memalign(32, size);
-        this->size = size;
-    }
-
-    ~llama_buffer() {
-        free(addr);
-    }
-};
-#endif
diff --git a/third_party/radpajama/main-redpajama-chat.cpp b/third_party/radpajama/main-redpajama-chat.cc
similarity index 81%
rename from third_party/radpajama/main-redpajama-chat.cpp
rename to third_party/radpajama/main-redpajama-chat.cc
index ae4d358d8..ff0e12422 100644
--- a/third_party/radpajama/main-redpajama-chat.cpp
+++ b/third_party/radpajama/main-redpajama-chat.cc
@@ -1,29 +1,64 @@
-// Defines sigaction on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#include "common-gptneox.h"
-#include "gptneox.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-#include <algorithm>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#include <signal.h>
-#endif
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
+╚──────────────────────────────────────────────────────────────────────────────╝
+│                                                                              │
+│  radpajama.com                                                               │
+│  Copyright (c) 2023 Ariel Núñez                                              │
+│  Copyright (c) 2023 Georgi Gerganov                                          │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files (the             │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and/or sell copies of the Software, and to          │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│  The above copyright notice and this permission notice shall be              │
+│  included in all copies or substantial portions of the Software.             │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
+│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
+│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
+│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
+│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
+│                                                                              │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
+#include "libc/calls/sigtimedwait.h"
+#include "libc/calls/struct/sigaction.h"
+#include "libc/calls/struct/siginfo.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/runtime/pathconf.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/sysconf.h"
+#include "libc/sysv/consts/f.h"
+#include "libc/sysv/consts/fileno.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/ok.h"
+#include "libc/sysv/consts/sa.h"
+#include "libc/sysv/consts/sicode.h"
+#include "libc/sysv/consts/ss.h"
+#include "libc/time/time.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/libcxx/algorithm"
+#include "third_party/libcxx/cassert"
+#include "third_party/libcxx/cinttypes"
+#include "third_party/libcxx/cmath"
+#include "third_party/libcxx/cstdio"
+#include "third_party/libcxx/cstring"
+#include "third_party/libcxx/ctime"
+#include "third_party/libcxx/fstream"
+#include "third_party/libcxx/iostream"
+#include "third_party/libcxx/string"
+#include "third_party/libcxx/vector"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/lockf.h"
+#include "third_party/radpajama/common-gptneox.h"
+#include "third_party/radpajama/gptneox.h"
+// clang-format off
 
 static console_state con_st;
 static gptneox_context ** g_ctx;
diff --git a/third_party/radpajama/main-redpajama.cpp b/third_party/radpajama/main-redpajama.cc
similarity index 86%
rename from third_party/radpajama/main-redpajama.cpp
rename to third_party/radpajama/main-redpajama.cc
index a37740b96..f17ebf308 100644
--- a/third_party/radpajama/main-redpajama.cpp
+++ b/third_party/radpajama/main-redpajama.cc
@@ -1,28 +1,63 @@
-// Defines sigaction on msys:
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#include "common-gptneox.h"
-#include "gptneox.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#include <signal.h>
-#endif
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
+╚──────────────────────────────────────────────────────────────────────────────╝
+│                                                                              │
+│  radpajama.com                                                               │
+│  Copyright (c) 2023 Ariel Núñez                                              │
+│  Copyright (c) 2023 Georgi Gerganov                                          │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files (the             │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and/or sell copies of the Software, and to          │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│  The above copyright notice and this permission notice shall be              │
+│  included in all copies or substantial portions of the Software.             │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
+│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
+│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
+│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
+│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
+│                                                                              │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "libc/calls/calls.h"
+#include "libc/calls/sigtimedwait.h"
+#include "libc/calls/struct/sigaction.h"
+#include "libc/calls/struct/siginfo.h"
+#include "libc/calls/weirdtypes.h"
+#include "libc/runtime/pathconf.h"
+#include "libc/runtime/runtime.h"
+#include "libc/runtime/sysconf.h"
+#include "libc/sysv/consts/f.h"
+#include "libc/sysv/consts/fileno.h"
+#include "libc/sysv/consts/o.h"
+#include "libc/sysv/consts/ok.h"
+#include "libc/sysv/consts/sa.h"
+#include "libc/sysv/consts/sicode.h"
+#include "libc/sysv/consts/ss.h"
+#include "libc/time/time.h"
+#include "third_party/getopt/getopt.h"
+#include "third_party/libcxx/cassert"
+#include "third_party/libcxx/cinttypes"
+#include "third_party/libcxx/cmath"
+#include "third_party/libcxx/cstdio"
+#include "third_party/libcxx/cstring"
+#include "third_party/libcxx/ctime"
+#include "third_party/libcxx/fstream"
+#include "third_party/libcxx/iostream"
+#include "third_party/libcxx/string"
+#include "third_party/libcxx/vector"
+#include "third_party/musl/crypt.h"
+#include "third_party/musl/lockf.h"
+#include "third_party/radpajama/common-gptneox.h"
+#include "third_party/radpajama/gptneox.h"
+// clang-format off
 
 static console_state con_st;
 static gptneox_context ** g_ctx;
@@ -288,7 +323,6 @@ int main(int argc, char ** argv) {
         is_interacting = params.interactive_first;
     }
 
-    bool is_antiprompt = false;
     bool input_noecho  = false;
     
     // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
@@ -516,12 +550,10 @@ int main(int argc, char ** argv) {
                     last_output += gptneox_token_to_str(ctx, id);
                 }
 
-                is_antiprompt = false;
                 // Check if each of the reverse prompts appears at the end of the output.
                 for (std::string & antiprompt : params.antiprompt) {
                     if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
                         is_interacting = true;
-                        is_antiprompt = true;
                         set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
                         fflush(stdout);
                         break;
@@ -619,4 +651,4 @@ int main(int argc, char ** argv) {
     set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/third_party/radpajama/main.cc b/third_party/radpajama/main.cc
deleted file mode 100644
index 40078ab9b..000000000
--- a/third_party/radpajama/main.cc
+++ /dev/null
@@ -1,891 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  llama.com                                                                   │
-│  Copyright (c) 2023 Justine Alexandra Roberts Tunney                         │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/stat.h"
-#include "libc/intrin/bits.h"
-#include "libc/log/log.h"
-#include "libc/nexgen32e/x86feature.h"
-#include "libc/stdio/stdio.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/msync.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/sysv/consts/sig.h"
-#include "third_party/ggml/common.h"
-#include "third_party/ggml/llama.h"
-#include "third_party/ggml/llama_util.h"
-#include "third_party/libcxx/atomic"
-#include "third_party/libcxx/iostream"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/vector"
-
-asm(".ident\t\"\\n\\n\
-llama.cpp (MIT License)\\n\
-Copyright (c) 2023 Georgi Gerganov\"");
-asm(".include \"libc/disclaimer.inc\"");
-// clang-format off
-
-static std::atomic<bool> is_interacting;
-static std::atomic<bool> is_terminated;
-
-#define EPHEMERAL(fmt) "\r\e[K\033[1;35m" fmt " \033[0m"
-
-static void sigint_handler_batch(int signo) {
-    is_terminated = true;
-}
-
-static void sigint_handler_interactive(int signo) {
-    if (!is_interacting) {
-        is_interacting = true;
-    } else {
-        is_terminated = true;
-    }
-}
-
-static int CompareTime(struct timespec a, struct timespec b) {
-  int cmp;
-  if (!(cmp = (a.tv_sec > b.tv_sec) - (a.tv_sec < b.tv_sec))) {
-    cmp = (a.tv_nsec > b.tv_nsec) - (a.tv_nsec < b.tv_nsec);
-  }
-  return cmp;
-}
-
-static int on_missing_feature(const char *name) {
-    fprintf(stderr, "%s: error: cpuid %s not detected\n", __func__, name);
-    fprintf(stderr, "%s: amd microprocessors made after 2017 usually work\n", __func__);
-    fprintf(stderr, "%s: intel microprocessors made after 2013 usually work\n", __func__);
-    return 1;
-}
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    ShowCrashReports();
-    setvbuf(stdin, NULL, _IONBF, 0);
-    setvbuf(stdout, NULL, _IONBF, 0);
-    setvbuf(stderr, NULL, _IONBF, 0);
-
-    params.model = "models/llama-7B/ggml-model.bin";
-
-#ifdef __x86_64__
-    if (!X86_HAVE(AVX2)) return on_missing_feature("avx2");
-    if (!X86_HAVE(AVX)) return on_missing_feature("avx");
-    if (!X86_HAVE(FMA)) return on_missing_feature("fma");
-    if (!X86_HAVE(SSE3)) return on_missing_feature("sse3");
-    if (!X86_HAVE(F16C)) {
-        fprintf(stderr, "%s: warning: cpuid f16c not detected; inference might crash\n", __func__);
-    }
-#endif /* __x86_64__ */
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    // save choice to use color for later
-    // (note for later: this is a slightly awkward choice)
-    static console_state con_st;
-    con_st.use_color = params.use_color;
-
-    con_st.multiline_input = params.multiline_input;
-    console_init(con_st);
-    atexit([]() { console_cleanup(con_st); });
-
-    if (params.perplexity) {
-        printf("\n************\n");
-        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-
-    if (params.embedding) {
-        printf("\n************\n");
-        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
-
-    if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
-                "expect poor results\n", __func__, params.n_ctx);
-    }
-
-    if (params.seed < 0) {
-        params.seed = time(NULL);
-    }
-
-    if (params.verbose) {
-        fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
-    }
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-//    params.prompt = R"(// this function checks if the number n is prime
-//bool is_prime(int n) {)";
-
-    llama_context * ctx;
-    struct stat model_stat;
-
-    // load the model and apply lora adapter, if any
-    ctx = llama_init_from_gpt_params(params);
-    if (ctx == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return 1;
-    }
-
-    stat(params.model.c_str(), &model_stat);
-
-    if (!params.lora_adapter.empty()) {
-        int err = llama_apply_lora_from_file(ctx,
-                                             params.lora_adapter.c_str(),
-                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
-                                             params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return 1;
-        }
-    }
-
-    // print system information
-    if (params.verbose) {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
-                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
-    }
-
-    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
-    // uncomment the "used_mem" line in llama.cpp to see the results
-    if (params.mem_test) {
-        {
-            const std::vector<llama_token> tmp(params.n_batch, 0);
-            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
-        }
-
-        {
-            const std::vector<llama_token> tmp = { 0, };
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
-        }
-
-        if (params.verbose) {
-            llama_print_timings(ctx);
-        }
-        llama_free(ctx);
-
-        return 0;
-    }
-
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
-    // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
-
-    const int n_ctx = llama_n_ctx(ctx);
-
-    if ((int) embd_inp.size() > n_ctx - 4) {
-        fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
-        return 1;
-    }
-
-    // number of tokens to keep when resetting context
-    int n_keep = params.n_keep;
-    if (n_keep < 0 || n_keep > (int)embd_inp.size() || params.instruct) {
-        n_keep = (int)embd_inp.size();
-    }
-    if (!n_keep && !params.n_keep_str.empty()) {
-        auto pivot = ::llama_tokenize(ctx, params.n_keep_str, false);
-        auto pos = std::search(embd_inp.begin(), embd_inp.end(),
-                               pivot.begin(), pivot.end());
-        if (pos == embd_inp.end()) {
-            fprintf(stderr, "%s: error: --n_keep %`'s substring not found within prompt\n",
-                    __func__, params.n_keep_str.c_str());
-            return 1;
-        }
-        n_keep = (pos - embd_inp.begin()) + (pivot.end() - pivot.begin());
-    }
-
-    // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
-    const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
-
-    // in instruct mode, we inject a prefix and a suffix to each input by the user
-    if (params.instruct) {
-        params.interactive_first = true;
-        params.antiprompt.push_back("### Instruction:\n\n");
-    }
-
-    // enable interactive mode if interactive start is specified
-    if (params.interactive_first) {
-        params.interactive = true;
-    }
-
-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
-    if (params.verbose_prompt) {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d %6d -> %`'s\n", i, embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
-        }
-        fprintf(stderr, "%s: first part of prompt: \"", __func__);
-        for (int i = 0; i < n_keep; i++) {
-            fprintf(stderr, "%'s", llama_token_to_str(ctx, embd_inp[i]));
-        }
-        fprintf(stderr, "\"\n");
-        fprintf(stderr, "%s: second part of prompt: \"", __func__);
-        for (int i = n_keep; i < embd_inp.size(); i++) {
-            fprintf(stderr, "%'s", llama_token_to_str(ctx, embd_inp[i]));
-        }
-        fprintf(stderr, "\"\n");
-        fprintf(stderr, "\n");
-    }
-
-    // setup ctrl-c handler
-    struct sigaction sa;
-    sa.sa_flags = 0;
-    sigemptyset(&sa.sa_mask);
-    if (params.interactive) {
-        sa.sa_handler = sigint_handler_interactive;
-    } else {
-        sa.sa_handler = sigint_handler_batch;
-    }
-    sigaction(SIGINT, &sa, NULL);
-
-    if (params.interactive) {
-        if (params.verbose) {
-            fprintf(stderr, "%s: interactive mode on.\n", __func__);
-        }
-
-        if (params.verbose && params.antiprompt.size()) {
-            for (auto antiprompt : params.antiprompt) {
-                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
-            }
-        }
-
-        if (params.verbose && !params.input_prefix.empty()) {
-            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
-        }
-    }
-
-    if (params.verbose) {
-        fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
-                params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-        fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n",
-                n_ctx, params.n_batch, params.n_predict, n_keep);
-        fprintf(stderr, "\n\n");
-    }
-
-    // TODO: replace with ring-buffer
-    std::vector<llama_token> last_n_tokens(n_ctx);
-    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-
-    if (params.verbose && params.interactive) {
-        fprintf(stderr, "== Running in interactive mode. ==\n"
-               " - Press Ctrl+C to interject at any time.\n"
-               " - Press Return to return control to LLaMa.\n"
-               " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_first;
-    }
-
-    const uint32_t kJtlpMagic = READ32LE("jtlp");
-    const uint32_t kJtlpVersion = 0;
-
-    struct jtlp_header {
-        uint8_t magic[4];
-        uint8_t version[4];
-        uint8_t state_size[8];
-        uint8_t model_dev[8];
-        uint8_t model_ino[8];
-        uint8_t model_mtim_sec[8];
-        uint8_t model_mtim_nsec[8];
-        uint8_t prompt_size[8];
-    };
-
-    enum jtlp_status {
-        kPromptPending,
-        kPromptCompleted,
-        kPromptFinished
-    };
-
-    enum jtlp_status prompt_status = kPromptPending;
-
-    bool is_antiprompt = false;
-    bool input_noecho  = !params.verbose;
-
-    int n_past     = 0;
-    int n_remain   = params.n_predict;
-    int n_consumed = 0;
-
-    // instantly reload prompt if it's cached
-    int fd = open(params.prompt_path.c_str(), O_RDONLY);
-    if (fd != -1) {
-        size_t state_size;
-        size_t prompt_size;
-        struct timespec mtim;
-        struct jtlp_header *header;
-        off_t rc = lseek(fd, 0, SEEK_END);
-        LLAMA_ASSERT(rc != -1);
-        void *map = MAP_FAILED;
-        size_t file_size = rc;
-        if (file_size < sizeof(header)) {
-            fprintf(stderr, "%s: prompt file too small\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        map = mmap(0, file_size, PROT_READ, MAP_SHARED, fd, 0);
-        if (map == MAP_FAILED) {
-            fprintf(stderr, "%s: mmap failed: %s\n",
-                    params.prompt_path.c_str(), strerror(errno));
-            goto CantReloadPrompt;
-        }
-        header = (struct jtlp_header *)map;
-        // check file format magic
-        if (READ32LE(header->magic) != kJtlpMagic) {
-            fprintf(stderr, "%s: prompt file has wrong magic\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        // check file format version
-        if (READ32LE(header->version) > kJtlpVersion) {
-            fprintf(stderr, "%s: prompt has future file format version\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        // check expected state size
-        state_size = llama_get_state_size(ctx);
-        if (READ64LE(header->state_size) != state_size) {
-            if (params.verbose) {
-                fprintf(stderr, "%s: prompt has stale data state size\n",
-                        params.prompt_path.c_str());
-            }
-            goto CantReloadPrompt;
-        }
-        // check model device id
-        if (READ64LE(header->model_dev) != model_stat.st_dev) {
-            fprintf(stderr, "%s: prompt is for different model (dev)\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        // check model inode id
-        if (READ64LE(header->model_ino) != model_stat.st_ino) {
-            fprintf(stderr, "%s: prompt is for different model (ino)\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        // check model modified timestamp
-        mtim.tv_sec = READ64LE(header->model_mtim_sec);
-        mtim.tv_nsec = READ64LE(header->model_mtim_nsec);
-        if (CompareTime(model_stat.st_mtim, mtim) > 0) {
-            if (params.verbose) {
-                fprintf(stderr, "%s: model file timestamp changed; will reload and regenerate prompt\n",
-                        params.prompt_path.c_str());
-            }
-            goto CantReloadPrompt;
-        }
-        // check prompt file size
-        prompt_size = READ64LE(header->prompt_size);
-        if (sizeof(struct jtlp_header) + prompt_size + state_size > file_size) {
-            fprintf(stderr, "%s: prompt file size unexpected\n",
-                    params.prompt_path.c_str());
-            goto CantReloadPrompt;
-        }
-        // check prompt textus
-        if (prompt_size != params.prompt.size() ||
-            memcmp(header + 1, params.prompt.c_str(), prompt_size) != 0) {
-            if (params.verbose) {
-                fprintf(stderr, "%s: prompt text changed; will reload and regenerate\n",
-                        params.prompt_path.c_str());
-            }
-            goto CantReloadPrompt;
-        }
-        // read the transformer state
-        llama_set_state_data(ctx, (uint8_t *)(header + 1) + prompt_size);
-        // we're finished loading the prompt file
-        if (params.verbose) {
-            fprintf(stderr, "%s: %s: reloaded previously saved prompt\n",
-                    __func__, params.prompt_path.c_str());
-        }
-        // now setup the business logic
-        llama_set_rng_seed(ctx, params.seed);
-        while ((int) embd_inp.size() > n_consumed) {
-            last_n_tokens.erase(last_n_tokens.begin());
-            last_n_tokens.push_back(embd_inp[n_consumed++]);
-        }
-        n_past = n_consumed;
-        prompt_status = kPromptFinished;
-        if (params.interactive) {
-            is_interacting = true;
-            for (std::string & antiprompt : params.antiprompt) {
-                auto toks = ::llama_tokenize(ctx, antiprompt, false);
-                if (std::equal(last_n_tokens.end() - toks.size(),
-                               last_n_tokens.end(),
-                               toks.begin(),
-                               toks.end())) {
-                    console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-                    printf("%s", antiprompt.c_str());
-                    fflush(stdout);
-                    break;
-                }
-            }
-            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-        }
-  CantReloadPrompt:
-        if (map != MAP_FAILED) {
-            munmap(map, file_size);
-        }
-        close(fd);
-    }
-
-    if (prompt_status == kPromptPending && params.verbose) {
-        // the first thing we will do is to output the prompt, so set color accordingly
-        console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-    }
-
-    std::vector<llama_token> embd;
-
-    if (prompt_status == kPromptPending &&
-        !params.verbose && con_st.use_color) {
-        fprintf(stderr, EPHEMERAL("loading weights..."));
-    }
-
-    while ((n_remain != 0 || params.interactive) && !is_terminated) {
-
-        // perform evaluation
-        if (embd.size() > 0) {
-            if (n_past + (int) embd.size() > n_ctx) {
-                n_past = n_keep;
-                embd.insert(embd.begin(),
-                            last_n_tokens.end() - (n_past - n_keep) / 2 - embd.size(),
-                            last_n_tokens.end() - embd.size());
-            }
-            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
-                int n_eval = (int) embd.size() - i;
-                if (n_eval > params.n_batch) {
-                    n_eval = params.n_batch;
-                }
-                if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
-                    fprintf(stderr, "%s : failed to eval\n", __func__);
-                    console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-                    return 1;
-                }
-                n_past += n_eval;
-                if (prompt_status == kPromptPending &&
-                    !params.verbose && con_st.use_color && embd_inp.size()) {
-                    fprintf(stderr, EPHEMERAL("loading prompt %d%% ..."),
-                            (int)(n_consumed / (double)embd_inp.size() * 100));
-                }
-            }
-            embd.clear();
-        }
-
-        // save prompt to disk atomically as soon as it's finished loading
-        bool was_completed = prompt_status == kPromptCompleted;
-        if (was_completed && !params.prompt_path.empty()) {
-            int fd = -1;
-            int close_rc;
-            uint8_t buf[8];
-            size_t file_size;
-            size_t state_size;
-            std::string tmppath;
-            void *map = MAP_FAILED;
-            struct jtlp_header header;
-            if (!params.verbose && con_st.use_color) {
-                fprintf(stderr, EPHEMERAL("caching prompt..."));
-            }
-            state_size = llama_get_state_size(ctx);
-            WRITE32LE(header.magic, kJtlpMagic);
-            WRITE32LE(header.version, kJtlpVersion);
-            WRITE64LE(header.state_size, state_size);
-            WRITE64LE(header.model_dev, model_stat.st_dev);
-            WRITE64LE(header.model_ino, model_stat.st_ino);
-            WRITE64LE(header.model_mtim_sec, model_stat.st_mtim.tv_sec);
-            WRITE64LE(header.model_mtim_nsec, model_stat.st_mtim.tv_nsec);
-            WRITE64LE(header.prompt_size, params.prompt.size());
-            file_size = sizeof(header) + params.prompt.size() + state_size;
-            tmppath.append(params.prompt_path);
-            tmppath.append(".XXXXXX");
-            fd = mkstemp(&tmppath[0]);
-            if (fd == -1) {
-                fprintf(stderr, "%s: mkstemp failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-                goto CouldNotSavePrompt;
-            }
-            if (ftruncate(fd, file_size)) {
-                fprintf(stderr, "%s: ftruncate failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-                goto CouldNotSavePrompt;
-            }
-            map = mmap(0, file_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-            if (map == MAP_FAILED) {
-                fprintf(stderr, "%s: mmap failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-                goto CouldNotSavePrompt;
-            }
-            llama_copy_state_data(ctx, (uint8_t *)map + sizeof(header) + params.prompt.size());
-            memcpy((uint8_t *)map + sizeof(header), params.prompt.c_str(), params.prompt.size());
-            memcpy(map, &header, sizeof(header));
-            if (msync(map, file_size, MS_ASYNC) && params.verbose) {
-                fprintf(stderr, "%s: msync failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-            }
-            if (munmap(map, file_size) && params.verbose) {
-                fprintf(stderr, "%s: munmap failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-            }
-            map = MAP_FAILED;
-            close_rc = close(fd);
-            fd = -1;
-            if (close_rc) {
-                fprintf(stderr, "%s: close failed: %s\n",
-                        tmppath.c_str(), strerror(errno));
-                goto CouldNotSavePrompt;
-            }
-            if (rename(tmppath.c_str(), params.prompt_path.c_str())) {
-                fprintf(stderr, "%s -> %s: rename failed: %s\n",
-                        tmppath.c_str(), params.prompt_path.c_str(), strerror(errno));
-                goto CouldNotSavePrompt;
-            }
-            tmppath.clear();
-      CouldNotSavePrompt:
-            if (map != MAP_FAILED) munmap(map, file_size);
-            if (fd != -1) close(fd);
-            if (!tmppath.empty()) unlink(tmppath.c_str());
-        }
-        if (was_completed) {
-            if (!params.verbose && con_st.use_color) {
-                fprintf(stderr, EPHEMERAL(""));
-            }
-            if (params.interactive) {
-                is_interacting = true;
-            }
-            prompt_status = kPromptFinished;
-            if (params.interactive) {
-                is_interacting = true;
-                fflush(stdout);
-                std::string last_output;
-                for (auto id : last_n_tokens) {
-                    last_output += llama_token_to_str(ctx, id);
-                }
-                for (std::string & antiprompt : params.antiprompt) {
-                    if (last_output.find(antiprompt.c_str(),
-                                         last_output.length() - antiprompt.length(),
-                                         antiprompt.length()) != std::string::npos) {
-                        console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-                        printf("%s", antiprompt.c_str());
-                        fflush(stdout);
-                        break;
-                    }
-                }
-                console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-            }
-        }
-
-        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-            // out of user input, sample next token
-            const float   temp            = params.temp;
-            const int32_t top_k           = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
-            const float   top_p           = params.top_p;
-            const float   tfs_z           = params.tfs_z;
-            const float   typical_p       = params.typical_p;
-            const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-            const float   repeat_penalty  = params.repeat_penalty;
-            const float   alpha_presence  = params.presence_penalty;
-            const float   alpha_frequency = params.frequency_penalty;
-            const int     mirostat        = params.mirostat;
-            const float   mirostat_tau    = params.mirostat_tau;
-            const float   mirostat_eta    = params.mirostat_eta;
-            const bool    penalize_nl     = params.penalize_nl;
-
-            llama_token id = 0;
-
-            {
-                auto logits  = llama_get_logits(ctx);
-                auto n_vocab = llama_n_vocab(ctx);
-
-                // Apply params.logit_bias map
-                for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-                    logits[it->first] += it->second;
-                }
-
-                std::vector<llama_token_data> candidates;
-                candidates.reserve(n_vocab);
-                for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                    candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-                }
-
-                llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-                // Apply penalties
-                float nl_logit = logits[llama_token_nl()];
-                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-                llama_sample_repetition_penalty(ctx, &candidates_p,
-                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                    last_n_repeat, repeat_penalty);
-                llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                    last_n_repeat, alpha_frequency, alpha_presence);
-                if (!penalize_nl) {
-                    logits[llama_token_nl()] = nl_logit;
-                }
-
-                if (temp <= 0) {
-                    // Greedy sampling
-                    id = llama_sample_token_greedy(ctx, &candidates_p);
-                } else {
-                    if (mirostat == 1) {
-                        static float mirostat_mu = 2.0f * mirostat_tau;
-                        const int mirostat_m = 100;
-                        llama_sample_temperature(ctx, &candidates_p, temp);
-                        id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-                    } else if (mirostat == 2) {
-                        static float mirostat_mu = 2.0f * mirostat_tau;
-                        llama_sample_temperature(ctx, &candidates_p, temp);
-                        id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-                    } else {
-                        // Temperature sampling
-                        llama_sample_top_k(ctx, &candidates_p, top_k, 1);
-                        llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
-                        llama_sample_typical(ctx, &candidates_p, typical_p, 1);
-                        llama_sample_top_p(ctx, &candidates_p, top_p, 1);
-                        llama_sample_temperature(ctx, &candidates_p, temp);
-                        id = llama_sample_token(ctx, &candidates_p);
-                    }
-                }
-
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(id);
-            }
-
-            // replace end of text token with newline token when in interactive mode
-            if (id == llama_token_eos() && params.interactive && !params.instruct) {
-                id = llama_token_newline.front();
-                if (params.antiprompt.size() != 0) {
-                    // tokenize and inject first reverse prompt
-                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
-                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
-                }
-            }
-
-            // add it to the context
-            embd.push_back(id);
-
-            // echo this to console
-            input_noecho = false;
-
-            // decrement remaining sampling budget
-            --n_remain;
-
-        } else {
-            // some user input remains from prompt or interaction, forward it to processing
-            while ((int) embd_inp.size() > n_consumed) {
-                embd.push_back(embd_inp[n_consumed]);
-                last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(embd_inp[n_consumed++]);
-                if ((int) embd.size() >= params.n_batch) {
-                    break;
-                }
-            }
-
-            // we've nearly finished loading the prompt
-            if (prompt_status == kPromptPending &&
-                (int) embd_inp.size() <= n_consumed) {
-                prompt_status = kPromptCompleted;
-            }
-        }
-
-        // checks for reverse prompt
-        //
-        // 1. in interactive mode, this lets us detect when the llm is
-        //    prompting the user, so we can pause for input, e.g.
-        //
-        //       --interactive
-        //       --prompt $'CompanionAI: How can I help you?\nHuman:'
-        //       --reverse-prompt 'Human:'
-        //
-        // 2. in normal mode, the reverse prompt can be used to specify
-        //    a custom EOS token, e.g.
-        //
-        //       --prompt 'Question: How old are you?\nAnswer: '
-        //       --reverse-prompt $'\n'
-        //
-        if (params.antiprompt.size()) {
-            std::string last_output;
-            for (auto id : last_n_tokens) {
-                last_output += llama_token_to_str(ctx, id);
-            }
-            is_antiprompt = false;
-            // Check if each of the reverse prompts appears at the end of the output.
-            for (std::string & antiprompt : params.antiprompt) {
-                if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
-                    is_antiprompt = true;
-                    break;
-                }
-            }
-            if (is_antiprompt && !params.interactive) {
-                printf("\n");
-                break;
-            }
-        }
-
-        // display text
-        if (!input_noecho) {
-            for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
-            }
-            fflush(stdout);
-        }
-        if (prompt_status == kPromptCompleted) {
-            continue;  // avoid reading line before last token loads
-        }
-
-        // reset color to default if we there is no pending user input
-        if (params.verbose && !input_noecho && (int)embd_inp.size() == n_consumed) {
-            console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-        }
-
-        if (is_antiprompt) {
-            is_interacting = true;
-            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-            fflush(stdout);
-        }
-
-        // in interactive mode, and not currently processing queued inputs;
-        // check if we should prompt the user for more
-        if (params.interactive && (int) embd_inp.size() <= n_consumed) {
-
-            if (n_past > 0 && is_interacting) {
-
-                // potentially set color to indicate we are taking user input
-                console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-
-                if (params.instruct) {
-                    printf("\n> ");
-                }
-
-                std::string buffer;
-                if (!params.input_prefix.empty()) {
-                    buffer += params.input_prefix;
-                    printf("%s", buffer.c_str());
-                }
-
-                // display a "waiting for input" indicator, just in case
-                // the model doesn't halt on the antiprompt.
-                if (con_st.use_color) {
-                    fprintf(stdout, "?\b");
-                    fflush(stdout);
-                }
-
-                std::string line;
-                bool another_line = true;
-                do {
-                    another_line = console_readline(con_st, line);
-                    buffer += line;
-                } while (another_line);
-
-                // done taking input, reset color
-                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-                // Add tokens to embd only if the input buffer is non-empty
-                // Entering a empty line lets the user pass control back
-                if (buffer.length() > 1) {
-                    // append input suffix if any
-                    if (!params.input_suffix.empty()) {
-                        buffer += params.input_suffix;
-                        printf("%s", params.input_suffix.c_str());
-                    }
-
-                    // instruct mode: insert instruction prefix
-                    if (params.instruct && !is_antiprompt) {
-                        n_consumed = embd_inp.size();
-                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
-                    }
-
-                    auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
-
-                    // instruct mode: insert response suffix
-                    if (params.instruct) {
-                        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
-                    }
-
-                    n_remain -= line_inp.size();
-                }
-
-                input_noecho = true; // do not echo this again
-            }
-
-            if (n_past > 0) {
-                is_interacting = false;
-            }
-            assert(!is_interacting);
-        }
-
-        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
-            if (params.instruct) {
-                is_interacting = true;
-            } else if (params.verbose) {
-                fprintf(stderr, " [end of text]\n");
-                break;
-            }
-        }
-
-        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
-        if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
-            n_remain = params.n_predict;
-            is_interacting = true;
-        }
-    }
-
-    if (is_terminated) {
-        console_cleanup(con_st);
-        printf("\n");
-        if (params.verbose) {
-            llama_print_timings(ctx);
-        }
-        _exit(128 + SIGINT);
-    }
-
-    if (params.verbose) {
-        llama_print_timings(ctx);
-    }
-    llama_free(ctx);
-
-    console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-    return 0;
-}
diff --git a/third_party/radpajama/quantize-gptneox.cc b/third_party/radpajama/quantize-gptneox.cc
new file mode 100644
index 000000000..27b0b4816
--- /dev/null
+++ b/third_party/radpajama/quantize-gptneox.cc
@@ -0,0 +1,110 @@
+/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
+│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
+╚──────────────────────────────────────────────────────────────────────────────╝
+│                                                                              │
+│  radpajama.com                                                               │
+│  Copyright (c) 2023 Ariel Núñez                                              │
+│  Copyright (c) 2023 Georgi Gerganov                                          │
+│                                                                              │
+│  Permission is hereby granted, free of charge, to any person obtaining       │
+│  a copy of this software and associated documentation files (the             │
+│  "Software"), to deal in the Software without restriction, including         │
+│  without limitation the rights to use, copy, modify, merge, publish,         │
+│  distribute, sublicense, and/or sell copies of the Software, and to          │
+│  permit persons to whom the Software is furnished to do so, subject to       │
+│  the following conditions:                                                   │
+│                                                                              │
+│  The above copyright notice and this permission notice shall be              │
+│  included in all copies or substantial portions of the Software.             │
+│                                                                              │
+│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
+│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
+│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
+│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
+│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
+│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
+│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
+│                                                                              │
+╚─────────────────────────────────────────────────────────────────────────────*/
+#include "third_party/ggml/ggml.h"
+#include "third_party/libcxx/cstdio"
+#include "third_party/libcxx/map"
+#include "third_party/libcxx/string"
+#include "third_party/radpajama/gptneox.h"
+// clang-format off
+
+static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
+  {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
+  {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
+  {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
+  //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
+  {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
+  {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
+  {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
+};
+
+// usage:
+//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
+//
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    if (argc < 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
+        for (auto it = GPTNEOX_FTYPE_MAP.begin(); it != GPTNEOX_FTYPE_MAP.end(); it++) {
+            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+        }
+        return 1;
+    }
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
+    const std::string fname_inp = argv[1];
+    const std::string fname_out = argv[2];
+
+    enum gptneox_ftype ftype;
+    if (argv[3][0] == 'q') {
+        auto it = GPTNEOX_FTYPE_MAP.find(argv[3]);
+        if (it == GPTNEOX_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        ftype = it->second;
+    } else {
+        ftype = (enum gptneox_ftype)atoi(argv[3]);
+    }
+
+    int nthread = argc > 4 ? atoi(argv[4]) : 0;
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    int64_t t_quantize_us = 0;
+
+    // load the model
+    {
+        const int64_t t_start_us = ggml_time_us();
+
+        if (gptneox_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
+            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+            return 1;
+        }
+
+        t_quantize_us = ggml_time_us() - t_start_us;
+    }
+
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+
+    return 0;
+}
diff --git a/third_party/radpajama/quantize-gptneox.cpp b/third_party/radpajama/quantize-gptneox.cpp
deleted file mode 100644
index 1f2f69555..000000000
--- a/third_party/radpajama/quantize-gptneox.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-#include "ggml.h"
-#include "gptneox.h"
-
-#include <cstdio>
-#include <map>
-#include <string>
-
-static const std::map<std::string, enum gptneox_ftype> GPTNEOX_FTYPE_MAP = {
-  {"q4_0", GPTNEOX_FTYPE_MOSTLY_Q4_0},
-  {"q4_1", GPTNEOX_FTYPE_MOSTLY_Q4_1},
-  {"q4_2", GPTNEOX_FTYPE_MOSTLY_Q4_2},
-  //{"q4_3", GPTNEOX_FTYPE_MOSTLY_Q4_3},
-  {"q5_0", GPTNEOX_FTYPE_MOSTLY_Q5_0},
-  {"q5_1", GPTNEOX_FTYPE_MOSTLY_Q5_1},
-  {"q8_0", GPTNEOX_FTYPE_MOSTLY_Q8_0},
-};
-
-// usage:
-//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
-//
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    if (argc < 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
-        for (auto it = GPTNEOX_FTYPE_MAP.begin(); it != GPTNEOX_FTYPE_MAP.end(); it++) {
-            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
-        }
-        return 1;
-    }
-
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
-
-    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
-
-    enum gptneox_ftype ftype;
-    if (argv[3][0] == 'q') {
-        auto it = GPTNEOX_FTYPE_MAP.find(argv[3]);
-        if (it == GPTNEOX_FTYPE_MAP.end()) {
-            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
-            return 1;
-        }
-        ftype = it->second;
-    } else {
-        ftype = (enum gptneox_ftype)atoi(argv[3]);
-    }
-
-    int nthread = argc > 4 ? atoi(argv[4]) : 0;
-
-    const int64_t t_main_start_us = ggml_time_us();
-
-    int64_t t_quantize_us = 0;
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (gptneox_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
-            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
-            return 1;
-        }
-
-        t_quantize_us = ggml_time_us() - t_start_us;
-    }
-
-    // report timing
-    {
-        const int64_t t_main_end_us = ggml_time_us();
-
-        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
-    }
-
-    return 0;
-}
\ No newline at end of file
diff --git a/third_party/radpajama/radpajama.cc b/third_party/radpajama/radpajama.cc
deleted file mode 100644
index ab2c27dcf..000000000
--- a/third_party/radpajama/radpajama.cc
+++ /dev/null
@@ -1,3395 +0,0 @@
-/*-*-mode:c++;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8-*-│
-│vi: set net ft=c++ ts=4 sts=4 sw=4 fenc=utf-8                              :vi│
-╚──────────────────────────────────────────────────────────────────────────────╝
-│                                                                              │
-│  llama.com                                                                   │
-│  Copyright (c) 2023 Justine Alexandra Roberts Tunney                         │
-│  Copyright (c) 2023 Georgi Gerganov                                          │
-│                                                                              │
-│  Permission is hereby granted, free of charge, to any person obtaining       │
-│  a copy of this software and associated documentation files (the             │
-│  "Software"), to deal in the Software without restriction, including         │
-│  without limitation the rights to use, copy, modify, merge, publish,         │
-│  distribute, sublicense, and/or sell copies of the Software, and to          │
-│  permit persons to whom the Software is furnished to do so, subject to       │
-│  the following conditions:                                                   │
-│                                                                              │
-│  The above copyright notice and this permission notice shall be              │
-│  included in all copies or substantial portions of the Software.             │
-│                                                                              │
-│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
-│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
-│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
-│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
-│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
-│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
-│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
-│                                                                              │
-╚─────────────────────────────────────────────────────────────────────────────*/
-#include "libc/assert.h"
-#include "libc/calls/calls.h"
-#include "libc/calls/struct/sigaction.h"
-#include "libc/calls/struct/stat.h"
-#include "libc/intrin/bits.h"
-#include "libc/log/log.h"
-#include "libc/nexgen32e/x86feature.h"
-#include "libc/stdio/stdio.h"
-#include "libc/sysv/consts/map.h"
-#include "libc/sysv/consts/msync.h"
-#include "libc/sysv/consts/o.h"
-#include "libc/sysv/consts/prot.h"
-#include "libc/sysv/consts/sig.h"
-#include "third_party/libcxx/map"
-#include "third_party/libcxx/atomic"
-#include "third_party/libcxx/iostream"
-#include "third_party/libcxx/string"
-#include "third_party/libcxx/vector"
-#include "third_party/radpajama/ggml.h"
-#include "third_party/radpajama/common.h"
-
-asm(".ident\t\"\\n\\n\
-llama.cpp (MIT License)\\n\
-Copyright (c) 2023 Georgi Gerganov\"");
-asm(".include \"libc/disclaimer.inc\"");
-// clang-format off
-
-
-
-// TODO: Add back in n_ctx (max_position_embeddings) to ggml model, it is currently hard-coded to 2048 max for llama
-
-#define GPTNEOX_USE_SCRATCH
-#define GPTNEOX_MAX_SCRATCH_BUFFERS 16
-
-// available open-assistant based gptneox models
-// OpenAssistant/stablelm-7b-sft-v7-epoch-3
-// OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5
-enum e_model {
-    MODEL_UNKNOWN,
-    MODEL_3B, // StabilityAI Base Alpha 3B
-    MODEL_7B,
-    MODEL_12B,
-    MODEL_20B,
-};
-
-static const size_t MiB = 1024*1024;
-static console_state con_st;
-static gptneox_context ** g_ctx;
-
-void sigint_handler(int signo) {
-    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-    printf("\n"); // this also force flush stdout.
-    if (signo == SIGINT) {
-        if (!is_interacting) {
-            is_interacting=true;
-        } else {
-            gptneox_print_timings(*g_ctx);
-            _exit(130);
-        }
-    }
-}
-
-// computed for n_ctx == 2048
-// TODO: dynamically determine these sizes
-// TODO: To load the stablelm 3B model on my test XR will require some tricks, small ggml context size, mmap support, among others, but is maybe feasible, is a smaller n_ctx required? 512 instead of 2048/4096? Does mmap work as desired on iOS?
-//       needs modifications in ggml
-
-// TODO: Modify for gptneox, how are these values actually determined?
-// TODO: This is now priority, 
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
-{
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
-        { MODEL_3B,    128ull * MiB },
-        { MODEL_7B,    512ull * MiB },
-        { MODEL_12B,   512ull * MiB },
-        { MODEL_20B,   512ull * MiB },
-    };
-    return _MEM_REQ_SCRATCH0;
-}
-
-// TODO: Modify for gptneox, how are these values actually determined?
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
-{
-    static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
-        { MODEL_3B,    128ull * MiB },
-        { MODEL_7B,    512ull * MiB },
-        { MODEL_12B,   512ull * MiB },
-        { MODEL_20B,   512ull * MiB },
-    };
-    return _MEM_REQ_SCRATCH1;
-}
-
-// TODO: Modify for gptneox, how are these values actually determined?
-// 2*n_embd*n_ctx*n_layer*sizeof(float16)
-// llama 7B: 2 * 768 * 32 * 2 = 98304
-static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
-{
-    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
-        { MODEL_3B,   512ull * MiB },
-        { MODEL_7B,   1026ull * MiB },
-        { MODEL_12B,  1608ull * MiB },
-        { MODEL_20B,  1608ull * MiB },
-    };
-    return _MEM_REQ_KV_SELF;
-}
-
-// TODO: Modify for gptneox, how are these values actually determined?
-// this is mostly needed for temporary mul_mat buffers to dequantize the data
-// not actually needed if BLAS is disabled
-static const std::map<e_model, size_t> & MEM_REQ_EVAL()
-{
-    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
-        { MODEL_3B,   512ull * MiB },
-        { MODEL_7B,   768ull * MiB },
-        { MODEL_12B, 1024ull * MiB },
-        { MODEL_20B, 1024ull * MiB },
-    };
-    return _MEM_REQ_EVAL;
-}
-
-// default hparams (GPT-NeoX oasst 12B)
-struct gptneox_hparams {
-    uint32_t n_vocab = 50288;
-    uint32_t n_ctx   = 4096;   // this is provided as user input?
-    uint32_t n_embd  = 5120;
-    uint32_t n_head  = 40;
-    uint32_t n_layer = 36;
-    uint32_t n_rot   = 32;
-    uint32_t use_parallel_residual = 1; // 1 = true, 0 = false
-    enum gptneox_ftype ftype = GPTNEOX_FTYPE_MOSTLY_F16;
-
-    bool operator!=(const gptneox_hparams & other) const {
-        return memcmp(this, &other, sizeof(gptneox_hparams));
-    }
-};
-
-struct gptneox_layer {
-    // input_layernorm
-    struct ggml_tensor * ln_attn_g;
-    struct ggml_tensor * ln_attn_b;
-
-    // post_attention_layernorm
-    struct ggml_tensor * ln_ff_g;
-    struct ggml_tensor * ln_ff_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // ff
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gptneox_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
-
-    struct ggml_context * ctx = NULL;
-
-    gptneox_buffer buf;
-
-    int n; // number of tokens currently in the cache
-
-    ~gptneox_kv_cache() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-    }
-};
-
-struct gptneox_model {
-    e_model type = MODEL_UNKNOWN;
-
-    gptneox_hparams hparams;
-
-    // final normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    // word embedding
-    struct ggml_tensor * wte;
-
-    // language model head
-    struct ggml_tensor * lmh_g;
-
-    std::vector<gptneox_layer> layers;
-
-    // context
-    struct ggml_context * ctx = NULL;
-
-    // key + value cache for the self attention
-    // TODO: move to gptneox_state
-    struct gptneox_kv_cache kv_self;
-
-    // the model memory buffer
-    gptneox_buffer buf;
-
-    // model memory mapped file
-    std::unique_ptr<gptneox_mmap> mapping;
-
-    // objects representing data potentially being locked in memory
-    gptneox_mlock mlock_buf;
-    gptneox_mlock mlock_mmap;
-
-    // for quantize-stats only
-    std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
-
-    ~gptneox_model() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
-    }
-};
-
-struct gptneox_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    struct token_score {
-        token tok;
-        float score;
-    };
-
-    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
-};
-
-struct gptneox_context {
-    std::mt19937 rng;
-
-    int64_t t_load_us = 0;
-    int64_t t_start_us = 0;
-    bool has_evaluated_once = false;
-
-    int64_t t_sample_us = 0;
-    int64_t t_eval_us   = 0;
-    int64_t t_p_eval_us = 0;
-
-    int32_t n_sample = 0; // number of tokens sampled
-    int32_t n_eval   = 0; // number of eval calls
-    int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
-
-    gptneox_model model;
-    gptneox_vocab vocab;
-
-    size_t mem_per_token = 0;
-
-    // decode output (2-dimensional array: [n_tokens][n_vocab])
-    std::vector<float> logits;
-    bool logits_all = false;
-
-    // input embedding (1-dimensional array: [n_embd])
-    std::vector<float> embedding;
-
-    // memory buffers used to evaluate the model
-    // TODO: move in gptneox_state
-    gptneox_buffer buf_compute;
-    gptneox_buffer buf_scratch[GPTNEOX_MAX_SCRATCH_BUFFERS];
-
-    int    buf_last = 0;
-    size_t buf_max_size[GPTNEOX_MAX_SCRATCH_BUFFERS] = { 0 };
-
-    void use_buf(struct ggml_context * ctx, int i) {
-#if defined(GPTNEOX_USE_SCRATCH)
-        size_t last_size = 0;
-
-        if (i == -1) {
-            last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
-        } else {
-            auto & buf = buf_scratch[i];
-            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
-        }
-
-        if (buf_last >= 0) {
-            buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
-        }
-
-        buf_last = i;
-#else
-        (void) i;
-        (void) ctx;
-#endif
-    }
-
-    size_t get_buf_max_mem(int i) const {
-#if defined(GPTNEOX_USE_SCRATCH)
-        return buf_max_size[i];
-#else
-        (void) i;
-        return 0;
-#endif
-    }
-};
-
-template <typename T>
-static T checked_mul(T a, T b) {
-    T ret = a * b;
-    if (a != 0 && ret / a != b) {
-        throw format("overflow multiplying %llu * %llu",
-                     (unsigned long long) a, (unsigned long long) b);
-    }
-    return ret;
-}
-
-static size_t checked_div(size_t a, size_t b) {
-    if (b == 0 || a % b != 0) {
-        throw format("error dividing %zu / %zu", a, b);
-    }
-    return a / b;
-}
-
-static std::string gptneox_format_tensor_shape(const std::vector<uint32_t> & ne) {
-    char buf[256];
-    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
-    for (size_t i = 1; i < ne.size(); i++) {
-        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
-    }
-    return buf;
-}
-
-static size_t gptneox_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
-    size_t size = ggml_type_size(type);
-    for (uint32_t dim : ne) {
-        size = checked_mul<size_t>(size, dim);
-    }
-    return size / ggml_blck_size(type);
-}
-
-struct gptneox_load_tensor_shard {
-    std::vector<uint32_t> ne;
-    size_t size;
-    enum ggml_type type;
-    size_t file_idx;
-    size_t file_off;
-
-    void calc_size() {
-        size = gptneox_calc_tensor_size(ne, type);
-    }
-};
-
-enum gptneox_split_type {
-    SPLIT_NONE,
-    SPLIT_BY_COLUMNS,
-    SPLIT_BY_ROWS
-};
-
-struct gptneox_load_tensor {
-    std::vector<gptneox_load_tensor_shard> shards;
-
-    std::string name;
-    enum ggml_type type = GGML_TYPE_F32;
-    gptneox_split_type split_type = SPLIT_NONE;
-    std::vector<uint32_t> ne;
-    size_t size;
-    struct ggml_tensor * ggml_tensor = NULL;
-    uint8_t * data;
-
-    gptneox_load_tensor(const std::string & name) : name(name) {}
-
-    void calc_all() {
-        calc_type();
-        calc_split_type();
-        calc_ne();
-        calc_size();
-    }
-
-    void calc_type() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.type != first_shard.type) {
-                throw format("inconsistent tensor shard type in '%s'", name.c_str());
-            }
-        }
-        type = first_shard.type;
-    }
-
-    void calc_split_type() {
-        if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
-            shards.size() == 1) { // only one file?
-            split_type = SPLIT_NONE;
-        } else if (name.find("tok_embeddings.") == 0 ||
-            name.find(".attention.wo.weight") != std::string::npos ||
-            name.find(".feed_forward.w2.weight") != std::string::npos) {
-            split_type = SPLIT_BY_COLUMNS;
-        } else {
-            split_type = SPLIT_BY_ROWS;
-        }
-    }
-
-    void calc_ne() {
-        const auto & first_shard = shards.at(0);
-        for (const auto & shard : shards) {
-            if (shard.ne != first_shard.ne) {
-                throw format("inconsistent tensor shard shape in '%s': first was %s, other was %s",
-                             name.c_str(), gptneox_format_tensor_shape(first_shard.ne).c_str(), gptneox_format_tensor_shape(shard.ne).c_str());
-            }
-        }
-        ne = first_shard.ne;
-        GPTNEOX_ASSERT(shards.size() <= UINT32_MAX);
-        uint32_t n_shards = (uint32_t) shards.size();
-        switch (split_type) {
-            case SPLIT_NONE:
-                ne = first_shard.ne;
-                break;
-            case SPLIT_BY_COLUMNS:
-                ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
-                      first_shard.ne[1]};
-                break;
-            case SPLIT_BY_ROWS:
-                ne = {first_shard.ne[0],
-                      checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
-                break;
-        }
-    }
-
-    void calc_size() {
-        size = gptneox_calc_tensor_size(ne, type);
-    }
-};
-
-struct gptneox_load_tensors_map {
-    // tensors is kept in a separate vector to preserve file order
-    std::vector<gptneox_load_tensor> tensors;
-    std::unordered_map<std::string, size_t> name_to_idx;
-};
-
-enum gptneox_file_version {
-    GPTNEOX_FILE_VERSION_GGML,
-    GPTNEOX_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
-    GPTNEOX_FILE_VERSION_GGJT_V1, // added padding
-};
-
-struct gptneox_file_loader {
-    gptneox_file file;
-    gptneox_file_version file_version;
-    gptneox_hparams hparams;
-    gptneox_vocab vocab;
-
-    gptneox_file_loader(const char * fname, size_t file_idx, gptneox_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
-        fprintf(stderr, "gptneox.cpp: loading model from %s\n", fname);
-        read_magic();
-        read_hparams();
-        read_vocab();
-        read_tensor_metadata(file_idx, tensors_map);
-    }
-    void read_magic() {
-        uint32_t magic = file.read_u32();
-        uint32_t version = 0;
-
-        if (magic != 'ggml') {
-            version = file.read_u32();
-        }
-
-        if (magic == 'ggml' && version == 0) {
-            file_version = GPTNEOX_FILE_VERSION_GGML;
-        } else if (magic == 'ggmf' && version == 1) {
-            file_version = GPTNEOX_FILE_VERSION_GGMF_V1;
-        } else if (magic == 'ggjt' && version == 1) {
-            file_version = GPTNEOX_FILE_VERSION_GGJT_V1;
-        } else {
-            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                         magic, version);
-        }
-    }
-    void read_hparams() {
-        hparams.n_vocab = file.read_u32();
-        hparams.n_ctx = file.read_u32();
-        hparams.n_embd = file.read_u32();
-        hparams.n_head = file.read_u32();
-        hparams.n_layer = file.read_u32();
-        hparams.n_rot = file.read_u32();
-        hparams.use_parallel_residual = file.read_u32();
-        hparams.ftype = (enum gptneox_ftype) file.read_u32();
-    }
-    void read_vocab() {
-        vocab.id_to_token.resize(hparams.n_vocab);
-
-        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-            uint32_t len = file.read_u32();
-            std::string word = file.read_string(len);
-
-            float score = 0.0f;
-            // TODO: Implement scores in gptneox
-            /*if (file_version >= GPTNEOX_FILE_VERSION_GGMF_V1) {
-                file.read_raw(&score, sizeof(score));
-            }*/
-
-            vocab.token_to_id[word] = i;
-
-            auto & tok_score = vocab.id_to_token[i];
-            tok_score.tok = std::move(word);
-            tok_score.score = score;
-        }
-    }
-    void read_tensor_metadata(size_t file_idx, gptneox_load_tensors_map & tensors_map) {
-        while (file.tell() < file.size) {
-            gptneox_load_tensor_shard shard;
-            uint32_t n_dims = file.read_u32();
-            uint32_t name_len = file.read_u32();
-            shard.type = (enum ggml_type) file.read_u32();
-            shard.ne.resize(n_dims);
-            file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
-            std::string name = file.read_string(name_len);
-            if (n_dims < 1 || n_dims > 2) {
-                throw format("gptneox.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
-            }
-            switch (shard.type) {
-                case GGML_TYPE_F32:
-                case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
-                case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q4_2:
-                case GGML_TYPE_Q5_0:
-                case GGML_TYPE_Q5_1:
-                case GGML_TYPE_Q8_0:
-                    break;
-                default: {
-                    throw format("unrecognized tensor type %u\n", shard.type);
-                }
-            }
-
-            if (file_version >= GPTNEOX_FILE_VERSION_GGJT_V1) {
-                // skip to the next multiple of 32 bytes
-                file.seek(-file.tell() & 31, SEEK_CUR);
-            }
-            shard.file_idx = file_idx;
-            shard.file_off = file.tell();
-
-            shard.calc_size();
-            file.seek(shard.size, SEEK_CUR);
-
-            auto it = tensors_map.name_to_idx.find(name);
-            size_t idx;
-            if (it != tensors_map.name_to_idx.end()) {
-                idx = it->second;
-            } else {
-                tensors_map.tensors.emplace_back(name);
-                idx = tensors_map.tensors.size() - 1;
-                tensors_map.name_to_idx.emplace(name, idx);
-            }
-            tensors_map.tensors.at(idx).shards.push_back(shard);
-        }
-    }
-};
-
-struct gptneox_file_saver {
-    gptneox_file file;
-    gptneox_file_loader * any_file_loader;
-    gptneox_file_saver(const char * fname, gptneox_file_loader * any_file_loader, enum gptneox_ftype new_ftype)
-        : file(fname, "wb"), any_file_loader(any_file_loader) {
-        fprintf(stderr, "gptneox.cpp: saving model to %s\n", fname);
-        write_magic();
-        write_hparams(new_ftype);
-        write_vocab();
-    }
-    void write_magic() {
-        file.write_u32('ggjt'); // magic
-        file.write_u32(1); // version
-    }
-    void write_hparams(enum gptneox_ftype new_ftype) {
-        const gptneox_hparams & hparams = any_file_loader->hparams;
-        file.write_u32(hparams.n_vocab);
-        file.write_u32(hparams.n_ctx);
-        file.write_u32(hparams.n_embd);
-        file.write_u32(hparams.n_head);
-        file.write_u32(hparams.n_layer);
-        file.write_u32(hparams.n_rot);
-        file.write_u32(hparams.use_parallel_residual);
-        file.write_u32(new_ftype);
-    }
-    void write_vocab() {
-        if (any_file_loader->file_version == GPTNEOX_FILE_VERSION_GGML) {
-            fprintf(stderr, "gptneox.cpp: WARNING: input is an old file that doesn't have scores; will add dummy scores\n");
-        }
-        uint32_t n_vocab = any_file_loader->hparams.n_vocab;
-        for (uint32_t i = 0; i < n_vocab; i++) {
-            const auto & token_score = any_file_loader->vocab.id_to_token.at(i);
-            file.write_u32((uint32_t) token_score.tok.size());
-            file.write_raw(token_score.tok.data(), token_score.tok.size());
-            // TODO: Implement scores in gptneox?
-            //file.write_raw(&token_score.score, sizeof(token_score.score));
-        }
-    }
-    void write_tensor(gptneox_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
-        switch (new_type) {
-            case GGML_TYPE_F32:
-            case GGML_TYPE_F16:
-            case GGML_TYPE_Q4_0:
-            case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
-            case GGML_TYPE_Q5_0:
-            case GGML_TYPE_Q5_1:
-            case GGML_TYPE_Q8_0:
-                break;
-            default: GPTNEOX_ASSERT(false);
-        }
-        file.write_u32((uint32_t) tensor.ne.size());
-        file.write_u32((uint32_t) tensor.name.size());
-        file.write_u32(new_type);
-        file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
-        file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
-        GPTNEOX_ASSERT(new_size == gptneox_calc_tensor_size(tensor.ne, new_type));
-        file.write_raw(new_data, new_size);
-    }
-};
-
-struct gptneox_model_loader {
-    std::vector<std::unique_ptr<gptneox_file_loader>> file_loaders;
-    gptneox_load_tensors_map tensors_map;
-    bool use_mmap;
-    size_t num_ggml_tensors_created = 0;
-    struct ggml_context * ggml_ctx = NULL;
-    std::unique_ptr<gptneox_mmap> mapping;
-
-    gptneox_model_loader(const std::string & fname_base, bool use_mmap, bool vocab_only) {
-        auto first_file = new gptneox_file_loader(fname_base.c_str(), 0, tensors_map);
-        file_loaders.emplace_back(first_file);
-        uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
-        for (uint32_t i = 1; i < n_parts; i++) {
-            std::string fname = fname_base + "." + std::to_string(i);
-            auto ith_file = new gptneox_file_loader(fname.c_str(), i, tensors_map);
-            file_loaders.emplace_back(ith_file);
-            if (ith_file->hparams != first_file->hparams) {
-                throw format("gptneox.cpp: hparams inconsistent between files");
-            }
-        }
-        if (!gptneox_mmap::SUPPORTED) {
-            use_mmap = false;
-        }
-        if (use_mmap && alignment_prevents_mmap()) {
-            fprintf(stderr, "gptneox.cpp: can't use mmap because tensors are not aligned; convert to new format to avoid this\n");
-            use_mmap = false;
-        }
-        this->use_mmap = use_mmap;
-        for (gptneox_load_tensor & lt : tensors_map.tensors) {
-            lt.calc_all();
-        }
-    }
-
-    bool alignment_prevents_mmap() {
-        for (const gptneox_load_tensor & lt : tensors_map.tensors) {
-            for (const gptneox_load_tensor_shard & shard : lt.shards) {
-                if (shard.file_off & 3) {
-                    return true;
-                }
-            }
-        }
-        return false;
-    }
-
-    uint32_t guess_n_parts() const {
-        auto it = tensors_map.name_to_idx.find("gpt_neox.embed_in.weight");
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::string("missing gpt_neox.embed_in.weight");
-        }
-        const gptneox_load_tensor & lt = tensors_map.tensors.at(it->second);
-        return file_loaders.at(0)->hparams.n_embd / lt.shards.at(0).ne.at(0);
-    }
-
-    void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
-        *ctx_size_p = *mmapped_size_p = 0;
-        for (const gptneox_load_tensor & lt : tensors_map.tensors) {
-            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
-        }
-    }
-
-    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw format("gptneox.cpp: tensor '%s' is missing from model", name.c_str());
-        }
-        gptneox_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw format("gptneox.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                         name.c_str(), gptneox_format_tensor_shape(ne).c_str(), gptneox_format_tensor_shape(lt.ne).c_str());
-        }
-
-        return get_tensor_for(lt);
-    }
-
-    struct ggml_tensor * get_tensor_for(gptneox_load_tensor & lt) {
-        struct ggml_tensor * tensor;
-        if (lt.ne.size() == 2) {
-            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
-        } else {
-            GPTNEOX_ASSERT(lt.ne.size() == 1);
-            tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
-        }
-        GPTNEOX_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
-        lt.ggml_tensor = tensor;
-        num_ggml_tensors_created++;
-        return tensor;
-    }
-
-    void done_getting_tensors() {
-        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-            throw std::string("gptneox.cpp: file contained more tensors than expected");
-        }
-    }
-
-    void load_all_data(gptneox_progress_callback progress_callback, void *  progress_callback_user_data, gptneox_mlock * lmlock) {
-        size_t data_size = 0;
-        for (const gptneox_load_tensor & lt : tensors_map.tensors) {
-            data_size += lt.size;
-        }
-
-        if (use_mmap) {
-            mapping.reset(new gptneox_mmap(&file_loaders.at(0)->file));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
-            if (lmlock) {
-                lmlock->init(mapping->addr);
-            }
-        }
-
-        size_t done_size = 0;
-        for (gptneox_load_tensor & lt : tensors_map.tensors) {
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            GPTNEOX_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
-            lt.data = (uint8_t *) lt.ggml_tensor->data;
-            load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
-            done_size += lt.size;
-            if (use_mmap && lmlock) {
-                lmlock->grow_to(done_size);
-            }
-        }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
-    }
-
-    void load_data_for(gptneox_load_tensor & lt) {
-        if (use_mmap) {
-            GPTNEOX_ASSERT(lt.shards.size() == 1);
-            lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
-        } else if (lt.split_type == SPLIT_NONE) {
-            gptneox_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
-            file.seek(lt.shards.at(0).file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
-        } else if (lt.split_type == SPLIT_BY_ROWS) {
-            size_t offset = 0;
-            for (gptneox_load_tensor_shard & shard : lt.shards) {
-                gptneox_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                file.read_raw(lt.data + offset, shard.size);
-                offset += shard.size;
-            }
-            GPTNEOX_ASSERT(offset == lt.size);
-        } else if (lt.split_type == SPLIT_BY_COLUMNS) {
-            // Let's load the data into temporary buffers to ensure the OS performs large loads.
-            std::vector<gptneox_buffer> tmp_bufs;
-            tmp_bufs.resize(lt.shards.size());
-            for (size_t i = 0; i < lt.shards.size(); i++) {
-                gptneox_load_tensor_shard & shard = lt.shards.at(i);
-                gptneox_file & file = file_loaders.at(shard.file_idx)->file;
-                file.seek(shard.file_off, SEEK_SET);
-                tmp_bufs.at(i).resize(shard.size);
-                file.read_raw(tmp_bufs.at(i).addr, shard.size);
-            }
-            // Then reshape.
-            size_t num_rows = lt.ne.at(1);
-            size_t per_shard_row_size = lt.shards.at(0).size / num_rows;
-            size_t out_offset = 0;
-            for (size_t row = 0; row < num_rows; row++) {
-                for (gptneox_buffer & tmp_buf : tmp_bufs) {
-                    memcpy(lt.data + out_offset,
-                           tmp_buf.addr + row * per_shard_row_size,
-                           per_shard_row_size);
-                    out_offset += per_shard_row_size;
-                }
-            }
-            GPTNEOX_ASSERT(out_offset == lt.size);
-        }
-        if (0) {
-            print_checksum(lt);
-        }
-    }
-
-    static void print_checksum(gptneox_load_tensor & lt) {
-        uint32_t sum = 0;
-        for (size_t i = 0; i < lt.size; i++) {
-            uint8_t byte = lt.data[i];
-            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
-        }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
-                gptneox_format_tensor_shape(lt.ne).c_str(), lt.size);
-    }
-
-};
-
-
-//
-// kv cache
-//
-
-static bool kv_cache_init(
-        const struct gptneox_hparams & hparams,
-             struct gptneox_kv_cache & cache,
-                         ggml_type   wtype,
-                               int   n_ctx) {
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-
-    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
-    const int64_t n_elements = n_embd*n_mem;
-
-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MiB);
-
-    struct ggml_init_params params;
-    params.mem_size   = cache.buf.size;
-    params.mem_buffer = cache.buf.addr;
-    params.no_alloc   = false;
-
-    cache.ctx = ggml_init(params);
-
-    if (!cache.ctx) {
-        fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
-        return false;
-    }
-
-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-
-    return true;
-}
-
-struct gptneox_context_params gptneox_context_default_params() {
-    struct gptneox_context_params result = {
-        /*.n_ctx                       =*/ 512,
-        /*.n_parts                     =*/ -1,
-        /*.seed                        =*/ 0,
-        /*.f16_kv                      =*/ false,
-        /*.logits_all                  =*/ false,
-        /*.vocab_only                  =*/ false,
-        /*.use_mmap                    =*/ true,
-        /*.use_mlock                   =*/ false,
-        /*.embedding                   =*/ false,
-        /*.progress_callback           =*/ nullptr,
-        /*.progress_callback_user_data =*/ nullptr,
-    };
-
-    return result;
-}
-
-bool gptneox_mmap_supported() {
-    return gptneox_mmap::SUPPORTED;
-}
-
-bool gptneox_mlock_supported() {
-    return gptneox_mlock::SUPPORTED;
-}
-
-//
-// model loading
-//
-
-static const char *gptneox_file_version_name(gptneox_file_version version) {
-    switch (version) {
-        case GPTNEOX_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
-        case GPTNEOX_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case GPTNEOX_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
-        default: GPTNEOX_ASSERT(false);
-    }
-}
-
-static const char *gptneox_ftype_name(enum gptneox_ftype ftype) {
-    switch (ftype) {
-        case GPTNEOX_FTYPE_ALL_F32:     return "all F32";
-        case GPTNEOX_FTYPE_MOSTLY_F16:  return "mostly F16";
-        case GPTNEOX_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
-        case GPTNEOX_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
-        case GPTNEOX_FTYPE_MOSTLY_Q4_1_SOME_F16:
-                                      return "mostly Q4_1, some F16";
-        case GPTNEOX_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
-        //case GPTNEOX_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
-        case GPTNEOX_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
-        case GPTNEOX_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
-        case GPTNEOX_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
-        default:                      return "unknown, may not work";
-    }
-}
-
-static const char *gptneox_model_type_name(e_model type) {
-    switch (type) {
-        case MODEL_3B: return "3B";
-        case MODEL_7B: return "7B";
-        case MODEL_12B: return "12B";
-        case MODEL_20B: return "20B";
-        case MODEL_UNKNOWN: return "UNKNOWN";
-        default: GPTNEOX_ASSERT(false);
-    }
-}
-
-static void gptneox_model_load_internal(
-        const std::string & fname,
-        gptneox_context & lctx,
-        int n_ctx,
-        ggml_type memory_type,
-        bool use_mmap,
-        bool use_mlock,
-        bool vocab_only,
-        gptneox_progress_callback progress_callback,
-        void * progress_callback_user_data) {
-
-    lctx.t_start_us = ggml_time_us();
-
-    std::unique_ptr<gptneox_model_loader> ml(new gptneox_model_loader(fname, use_mmap, vocab_only));
-
-    lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
-    auto & model = lctx.model;
-    model.hparams = ml->file_loaders.at(0)->hparams;
-    gptneox_file_version file_version = ml->file_loaders.at(0)->file_version;
-    auto & hparams = model.hparams;
-    
-    {
-        switch (hparams.n_layer) {
-            case 16: {
-                if (hparams.n_embd < 6144) {
-                    model.type = e_model::MODEL_3B;
-                } else {
-                    model.type = e_model::MODEL_7B;
-                }
-                break;
-            }
-            // # <RedPajama>: we extend the model type settings for RedPajama models.  
-            case 32:{
-                if (hparams.n_embd == 2560) {
-                    model.type = e_model::MODEL_3B;
-                } else if (hparams.n_embd == 4096) {
-                    model.type = e_model::MODEL_7B;
-                }
-                else {
-                    model.type = e_model::MODEL_UNKNOWN;
-                }
-                break;
-            }
-            case 36: model.type = e_model::MODEL_12B; break;
-            case 44: model.type = e_model::MODEL_20B; break;
-        }
-
-        hparams.n_ctx = n_ctx;
-    }
-
-    {
-        fprintf(stderr, "%s: format     = %s\n",  __func__, gptneox_file_version_name(file_version));
-        fprintf(stderr, "%s: n_vocab    = %u\n",  __func__, hparams.n_vocab);
-        fprintf(stderr, "%s: n_ctx      = %u\n",  __func__, hparams.n_ctx);
-        fprintf(stderr, "%s: n_embd     = %u\n",  __func__, hparams.n_embd);
-        fprintf(stderr, "%s: n_head     = %u\n",  __func__, hparams.n_head);
-        fprintf(stderr, "%s: n_layer    = %u\n",  __func__, hparams.n_layer);
-        fprintf(stderr, "%s: n_rot      = %u\n",  __func__, hparams.n_rot);
-        fprintf(stderr, "%s: use_parallel_residual = %d\n", __func__, hparams.use_parallel_residual);
-        fprintf(stderr, "%s: ftype      = %u (%s)\n", __func__, hparams.ftype, gptneox_ftype_name(hparams.ftype));
-        fprintf(stderr, "%s: n_parts    = %zu\n", __func__, ml->file_loaders.size());
-        fprintf(stderr, "%s: model size = %s\n",  __func__, gptneox_model_type_name(model.type));
-    }
-
-    if (vocab_only) {
-        return;
-    }
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size, mmapped_size;
-    ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f KiB\n", __func__, ctx_size/1024.0);
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one gptneox_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required  = %7.2f MiB (+ %7.2f MiB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
-
-    // create the ggml context
-    {
-        lctx.model.buf.resize(ctx_size);
-        if (use_mlock) {
-            lctx.model.mlock_buf.init(lctx.model.buf.addr);
-            lctx.model.mlock_buf.grow_to(lctx.model.buf.size);
-        }
-
-        struct ggml_init_params params = {
-            /*.mem_size   =*/ lctx.model.buf.size,
-            /*.mem_buffer =*/ lctx.model.buf.addr,
-            /*.no_alloc   =*/ ml->use_mmap,
-        };
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            throw format("ggml_init() failed");
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const uint32_t n_embd  = hparams.n_embd;
-        const uint32_t n_layer = hparams.n_layer;
-        const uint32_t n_vocab = hparams.n_vocab;
-
-        ml->ggml_ctx = ctx;
-
-        model.wte       = ml->get_tensor("gpt_neox.embed_in.weight",            {n_embd, n_vocab});
-        model.ln_f_g    = ml->get_tensor("gpt_neox.final_layer_norm.weight",    {n_embd});
-        model.ln_f_b    = ml->get_tensor("gpt_neox.final_layer_norm.bias",      {n_embd});
-        model.lmh_g     = ml->get_tensor("embed_out.weight",                    {n_embd, n_vocab});
-
-        model.layers.resize(n_layer);
-        for (uint32_t i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            std::string layers_i = "gpt_neox.layers." + std::to_string(i);
-
-            layer.ln_attn_g = ml->get_tensor(layers_i + ".input_layernorm.weight", {n_embd});
-            layer.ln_attn_b = ml->get_tensor(layers_i + ".input_layernorm.bias", {n_embd});
-
-            layer.c_attn_attn_w = ml->get_tensor(layers_i + ".attention.query_key_value.weight", {n_embd, n_embd * 3});
-            layer.c_attn_attn_b = ml->get_tensor(layers_i + ".attention.query_key_value.bias", {n_embd * 3});
-            layer.c_attn_proj_w = ml->get_tensor(layers_i + ".attention.dense.weight", {n_embd, n_embd});
-            layer.c_attn_proj_b = ml->get_tensor(layers_i + ".attention.dense.bias", {n_embd});
-
-            layer.ln_ff_g = ml->get_tensor(layers_i + ".post_attention_layernorm.weight", {n_embd});
-            layer.ln_ff_b = ml->get_tensor(layers_i + ".post_attention_layernorm.bias", {n_embd});
-
-            layer.c_mlp_fc_w =   ml->get_tensor(layers_i + ".mlp.dense_h_to_4h.weight", {n_embd,   n_embd * 4});
-            layer.c_mlp_fc_b =   ml->get_tensor(layers_i + ".mlp.dense_h_to_4h.bias",   {n_embd * 4});
-            layer.c_mlp_proj_w = ml->get_tensor(layers_i + ".mlp.dense_4h_to_h.weight", {n_embd * 4,   n_embd});
-            layer.c_mlp_proj_b = ml->get_tensor(layers_i + ".mlp.dense_4h_to_h.bias",   {n_embd});
-        }
-    }
-
-    ml->done_getting_tensors();
-
-    // populate `tensors_by_name`
-    for (gptneox_load_tensor & lt : ml->tensors_map.tensors) {
-        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
-    }
-
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
-
-    model.mapping = std::move(ml->mapping);
-
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
-}
-
-static bool gptneox_model_load(
-        const std::string & fname,
-        gptneox_context & lctx,
-        int n_ctx,
-        ggml_type memory_type,
-        bool use_mmap,
-        bool use_mlock,
-        bool vocab_only,
-        gptneox_progress_callback progress_callback,
-        void *progress_callback_user_data) {
-    try {
-        gptneox_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
-                                  vocab_only, progress_callback, progress_callback_user_data);
-        return true;
-    } catch (const std::string & err) {
-        fprintf(stderr, "error loading model: %s\n", err.c_str());
-        return false;
-    }
-}
-
-// evaluate the transformer
-//
-//   - lctx:      llama context
-//   - tokens:    new batch of tokens to process
-//   - n_past:    the context size so far
-//   - n_threads: number of threads to use
-//
-static bool gptneox_eval_internal(
-        gptneox_context & lctx,
-    const gptneox_token * tokens,
-            const int   n_tokens,
-            const int   n_past,
-            const int   n_threads) {
-    const int64_t t_start_us = ggml_time_us();
-
-    const int N = n_tokens;
-
-    const auto & model   = lctx.model;
-    const auto & hparams = model.hparams;
-
-    auto & kv_self = model.kv_self;
-
-    GPTNEOX_ASSERT(!!kv_self.ctx);
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-    const int n_rot   = hparams.n_rot;
-
-    auto & mem_per_token = lctx.mem_per_token;
-    auto & buf_compute   = lctx.buf_compute;
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_compute.size,
-        /*.mem_buffer =*/ buf_compute.addr,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    ggml_cgraph gf = {};
-    gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_cublas() ? 1 : n_threads;
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        lctx.use_buf(ctx0, 0);
-
-        // input norm
-        {
-            cur = ggml_norm(ctx0, inpL);
-
-            // cur = ln_attn_g*cur + ln_attn_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_attn_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_attn_b, cur));
-        }
-
-        // self-attention
-        {
-            // attn
-            // [3*n_embd, n_embd] - model.layers[il].c_attn_attn_w
-            // [3*n_embd,      1] - model.layers[il].c_attn_attn_b
-            // [  n_embd,      N] - cur (in)
-            // [3*n_embd,      N] - cur (out)
-            //
-            // cur = attn_w*cur + attn_b
-            // [3*n_embd, N]
-            {
-                cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_attn_w, cur);
-                cur = ggml_add(ctx0,
-                        ggml_repeat(ctx0,
-                                    model.layers[il].c_attn_attn_b, cur),
-                        cur);
-            }
-             
-            // Split QKV and make contiguous
-            struct ggml_tensor * Qcur = ggml_view_3d(ctx0, cur,
-                                            n_embd/n_head,
-                                            n_head,
-                                            N,
-                                            ggml_element_size(cur) * 3 * n_embd/n_head,
-                                            ggml_element_size(cur) * 3 * n_embd,
-                                            ggml_element_size(cur) * n_embd/n_head * 0);
-            struct ggml_tensor * Kcur = ggml_view_3d(ctx0, cur,
-                                            n_embd/n_head,
-                                            n_head,
-                                            N,
-                                            ggml_element_size(cur) * 3 * n_embd/n_head,
-                                            ggml_element_size(cur) * 3 * n_embd,
-                                            ggml_element_size(cur) * n_embd/n_head * 1);
-            struct ggml_tensor * Vcur = ggml_view_3d(ctx0, cur,
-                                            n_embd/n_head,
-                                            n_head,
-                                            N,
-                                            ggml_element_size(cur) * 3 * n_embd/n_head,
-                                            ggml_element_size(cur) * 3 * n_embd,
-                                            ggml_element_size(cur) * n_embd/n_head * 2);
-            // TODO: Flatten without copying, or see if non-contiguous can be used for any of QKV.
-            Qcur = ggml_cpy(ctx0, Qcur,
-                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));
-            Kcur = ggml_cpy(ctx0, Kcur,
-                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));
-            Vcur = ggml_cpy(ctx0, Vcur,
-                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N));
-            
-            // MARK: gptneox RoPE Q and K, before cache
-            // Bit 2 for gptneox style (2)
-            // Bit 1 is zero for dont skip n_past +(0), use (2+1) = (3) if rope is applied to cache of k (after cache only)
-            Qcur = ggml_rope(ctx0, Qcur, n_past, n_rot, 2);
-            Kcur = ggml_rope(ctx0, Kcur, n_past, n_rot, 2); //3);
-
-            // store key and value to memory, not required if prompt if only a single token (not practical or likely)
-            //if (N >= 1) {
-                // Each entry in kv_self has byte size of (ggml_element_size * n_embd * n_ctx * n_layer)
-                Vcur = ggml_view_2d(ctx0, Vcur,
-                            n_embd,
-                            N,
-                            ggml_element_size(Vcur) * n_embd,
-                            0);
-                Vcur = ggml_transpose(ctx0, Vcur);
-            
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k,
-                                            n_embd * N, // num elements in current context (up to n_embd*n_ctx but usually less)
-                                            ggml_element_size(kv_self.k) * n_embd * (il * n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v,
-                                            N,
-                                            n_embd,
-                                            ggml_element_size(kv_self.v) * n_ctx,
-                                            ggml_element_size(kv_self.v) * ((il * n_ctx * n_embd) + n_past));
-            
-                // important: storing RoPE-ed version of K in the KV cache!
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            //}
-            
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        Qcur,
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, kv_self.k,
-                                (n_past + N) * n_embd,
-                                ggml_element_size(kv_self.k) * il * n_ctx * n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // K * Q
-            // Will use internally ggml_compute_forward_mul_mat_f16_f32 because K is f16 (cache) and Q is f32 (from q4_0)
-            // Outputs [N, N, H, B], so it seems like this is correct for "scores"
-            // K is internally transposed by ggml_mul_mat
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ,
-                                                ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)));
-            // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-            // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-            
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans = ggml_view_3d(ctx0, kv_self.v,
-                                                n_past + N,
-                                                n_embd/n_head,
-                                                n_head,
-                                                ggml_element_size(kv_self.v) * n_ctx,
-                                                ggml_element_size(kv_self.v) * n_ctx * n_embd/n_head,
-                                                ggml_element_size(kv_self.v) * il * n_ctx * n_embd);
-
-            // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            cur = ggml_cpy(ctx0, KQV_merged,
-                        ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-
-            // projection (first weight)
-            cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_proj_w, cur);
-
-            // projection (then bias)
-            cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
-        }
-
-        lctx.use_buf(ctx0, 1);
-        
-        if (hparams.use_parallel_residual == 1) {
-            //printf("use_parallel_residual == 1\n");
-            
-            // This is independent of the self-attention result, so it could be done in parallel to the self-attention
-            struct ggml_tensor * outAttn = cur;
-
-            // post attention layer norm
-            {
-                cur = ggml_norm(ctx0, inpL);
-
-                // cur = ln_attn_g*inpFF + ln_attn_b
-                cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_ff_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_ff_b, cur));
-            }
-
-
-            // feed-forward network
-            {
-                // note here we pass inpFF instead of cur
-                cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_fc_w, cur);
-
-                cur = ggml_add(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                            cur);
-
-                // GELU activation
-                cur = ggml_gelu(ctx0, cur);
-
-                // projection
-                // cur = proj_w*inpFF + proj_b
-                cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur);
-
-                cur = ggml_add(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                            cur);
-            }
-            //# pseudocode:
-            //# x = x + attn(ln1(x)) + mlp(ln2(x))
-            // inpL = inpL + outAttn + cur
-            cur = ggml_add(ctx0, outAttn, cur);
-            inpL = ggml_add(ctx0, inpL, cur);
-        } else if (hparams.use_parallel_residual == 0) {
-            //printf("use_parallel_residual == 0\n");
-            
-            // This takes the self-attention residual output as input to Feedforward
-            struct ggml_tensor * outAttn = cur;
-            struct ggml_tensor * inpFF = ggml_add(ctx0, outAttn, inpL);
-
-            // post attention layer norm
-            {
-                cur = ggml_norm(ctx0, inpFF);
-
-                // inpFF = ln_attn_g*inpFF + ln_attn_b
-                cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_ff_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_ff_b, cur));
-            }
-
-            // feed-forward network
-            {
-                // note here we pass inpFF instead of cur
-                cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_fc_w, cur);
-
-                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), cur);
-
-                cur = ggml_gelu(ctx0, cur);
-
-                cur = ggml_mul_mat(ctx0, model.layers[il].c_mlp_proj_w, cur);
-
-                cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), cur);
-            }
-
-            //# pseudocode:
-            //# x = x + attn(ln1(x)) (residual above as input to mlp)
-            //# x = x + mlp(ln2(x)) (residual after mlp aka inpL + cur)
-            //# <RedPajama>: we fixed a small issue in the gptneox.cpp fork when setting use_parallel_residual to False;
-            inpL = ggml_add(ctx0, inpFF, cur);
-        } else {
-            printf("use_parallel_residual == %d\n", hparams.use_parallel_residual);
-            assert(0);
-        }
-    }
-
-    lctx.use_buf(ctx0, 0);
-
-    // used at the end to optionally extract the embeddings
-    struct ggml_tensor * embeddings = NULL;
-
-    // norm
-    {
-        inpL = ggml_norm(ctx0, inpL);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-
-        embeddings = inpL;
-    }
-
-    // lm_head
-    inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
-
-    lctx.use_buf(ctx0, -1);
-
-    // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
-
-#ifdef GGML_PERF
-    // print timing information per ggml operation (for debugging purposes)
-    // requires GGML_PERF to be defined
-    ggml_graph_print(&gf);
-#endif
-
-    // plot the computation graph in dot format (for debugging purposes)
-    //if (n_past%100 == 0) {
-    //    ggml_graph_dump_dot(&gf, NULL, "llama.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // extract logits
-    {
-        auto & logits_out = lctx.logits;
-
-        if (lctx.logits_all) {
-            logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-        } else {
-            // return result for just the last token
-            logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-        }
-    }
-
-    // extract embeddings
-    if (lctx.embedding.size()) {
-        auto & embedding_out = lctx.embedding;
-
-        embedding_out.resize(n_embd);
-        memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
-    }
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-
-#if 0
-    printf("\n%s: used_mem = %.3f MiB, scratch -- %.3f MiB %.3f MiB\n", __func__,
-            ggml_used_mem(ctx0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(0)/1024.0/1024.0,
-            lctx.get_buf_max_mem(1)/1024.0/1024.0);
-#endif
-
-    ggml_free(ctx0);
-
-    // measure the performance only for the single-token evals
-    if (N == 1) {
-        lctx.t_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_eval++;
-    }
-    else if (N > 1) {
-        lctx.t_p_eval_us += ggml_time_us() - t_start_us;
-        lctx.n_p_eval += N;
-    }
-
-    return true;
-}
-
-//
-// tokenizer
-//
-
-static size_t utf8_len(char src) {
-    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
-    return lookup[highbits];
-}
-
-struct gptneox_sp_symbol {
-    using index = int;
-    index prev;
-    index next;
-    const char * text;
-    size_t n;
-};
-
-struct gptneox_sp_bigram {
-    struct comparator {
-        bool operator()(gptneox_sp_bigram & l, gptneox_sp_bigram & r) {
-            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
-        }
-    };
-    using queue_storage = std::vector<gptneox_sp_bigram>;
-    using queue = std::priority_queue<gptneox_sp_bigram, queue_storage, comparator>;
-    gptneox_sp_symbol::index left;
-    gptneox_sp_symbol::index right;
-    float score;
-    size_t size;
-};
-
-// original implementation:
-// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
-struct gptneox_tokenizer {
-    gptneox_tokenizer(const gptneox_vocab & vocab): vocab_(vocab) {}
-
-    void tokenize(const std::string & text, std::vector<gptneox_vocab::id> & output) {
-        // split string into utf8 chars
-        int index = 0;
-        size_t offs = 0;
-        while (offs < text.size()) {
-            gptneox_sp_symbol sym;
-            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
-            sym.text = text.c_str() + offs;
-            sym.n = char_len;
-            offs += char_len;
-            sym.prev = index - 1;
-            sym.next = offs == text.size() ? -1 : index + 1;
-            index++;
-            symbols_.emplace_back(std::move(sym));
-        }
-
-        // seed the work queue with all possible 2-character tokens.
-        for (size_t i = 1; i < symbols_.size(); ++i) {
-            try_add_bigram(i - 1, i);
-        }
-
-        // keep substituting the highest frequency pairs for as long as we can.
-        while (!work_queue_.empty()) {
-            auto bigram = work_queue_.top();
-            work_queue_.pop();
-
-            auto & left_sym = symbols_[bigram.left];
-            auto & right_sym = symbols_[bigram.right];
-
-            // if one of the symbols already got merged, skip it.
-            if (left_sym.n == 0 || right_sym.n == 0 ||
-                left_sym.n + right_sym.n != bigram.size) {
-                continue;
-            }
-
-            // merge the right sym into the left one
-            left_sym.n += right_sym.n;
-            right_sym.n = 0;
-
-            //printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
-
-            // remove the right sym from the chain
-            left_sym.next = right_sym.next;
-            if (right_sym.next >= 0) {
-                symbols_[right_sym.next].prev = bigram.left;
-            }
-
-            // find more substitutions
-            try_add_bigram(left_sym.prev, bigram.left);
-            try_add_bigram(bigram.left, left_sym.next);
-        }
-
-        for (int i = 0; i != -1; i = symbols_[i].next) {
-            auto & symbol = symbols_[i];
-            auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
-
-            if (token == vocab_.token_to_id.end()) {
-                // output any symbols that did not form tokens as bytes.
-                for (int j = 0; j < (int) symbol.n; ++j) {
-                    gptneox_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
-                    output.push_back(token_id);
-                }
-            } else {
-                output.push_back((*token).second);
-            }
-        }
-    }
-
-private:
-    void try_add_bigram(int left, int right) {
-        if (left == -1 || right == -1) {
-            return;
-        }
-
-        const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
-        auto token = vocab_.token_to_id.find(text);
-
-        if (token == vocab_.token_to_id.end()) {
-            return;
-        }
-
-        if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) {
-            return;
-        }
-
-        const auto &tok_score = vocab_.id_to_token[(*token).second];
-
-        gptneox_sp_bigram bigram;
-        bigram.left = left;
-        bigram.right = right;
-        bigram.score = tok_score.score;
-        bigram.size = text.size();
-        work_queue_.push(bigram);
-    }
-
-    const gptneox_vocab & vocab_;
-    std::vector<gptneox_sp_symbol> symbols_;
-    gptneox_sp_bigram::queue work_queue_;
-};
-
-static std::vector<gptneox_vocab::id> gptneox_tokenize(const gptneox_vocab & vocab, const std::string & text, bool bos) {
-    gptneox_tokenizer tokenizer(vocab);
-    std::vector<gptneox_vocab::id> output;
-
-    if (text.size() == 0) {
-        return output;
-    }
-
-    if (bos) {
-        output.push_back(gptneox_token_bos());
-    }
-
-    tokenizer.tokenize(text, output);
-    return output;
-}
-
-//
-// sampling
-//
-
-void gptneox_sample_softmax(struct gptneox_context * ctx, gptneox_token_data_array * candidates) {
-    assert(candidates->size > 0);
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Sort the logits in descending order
-    if (!candidates->sorted) {
-        std::sort(candidates->data, candidates->data + candidates->size, [](const gptneox_token_data & a, const gptneox_token_data & b) {
-            return a.logit > b.logit;
-        });
-        candidates->sorted = true;
-    }
-
-    float max_l = candidates->data[0].logit;
-    float cum_sum = 0.0f;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float p = expf(candidates->data[i].logit - max_l);
-        candidates->data[i].p = p;
-        cum_sum += p;
-    }
-    for (size_t i = 0; i < candidates->size; ++i) {
-        candidates->data[i].p /= cum_sum;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void gptneox_sample_top_k(struct gptneox_context * ctx, gptneox_token_data_array * candidates, int k, size_t min_keep) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    k = std::max(k, (int) min_keep);
-    k = std::min(k, (int) candidates->size);
-
-    // Sort scores in descending order
-    if (!candidates->sorted) {
-        auto comp = [](const gptneox_token_data & a, const gptneox_token_data & b) {
-            return a.logit > b.logit;
-        };
-        if (k == (int) candidates->size) {
-            std::sort(candidates->data, candidates->data + candidates->size, comp);
-        } else {
-            std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
-        }
-        candidates->sorted = true;
-    }
-    candidates->size = k;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void gptneox_sample_top_p(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float p, size_t min_keep) {
-    if (p >= 1.0f) {
-        return;
-    }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    gptneox_sample_softmax(ctx, candidates);
-
-    // Compute the cumulative probabilities
-    float cum_sum = 0.0f;
-    size_t last_idx = candidates->size;
-
-    for (size_t i = 0; i < candidates->size; ++i) {
-        cum_sum += candidates->data[i].p;
-
-        // Check if the running sum is greater than p or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep) {
-            last_idx = i;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the top-p tokens
-    candidates->size = last_idx;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void gptneox_sample_tail_free(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float z, size_t min_keep) {
-    if (z >= 1.0f || candidates->size <= 2) {
-        return;
-    }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    gptneox_sample_softmax(nullptr, candidates);
-
-    // Compute the first and second derivatives
-    std::vector<float> first_derivatives(candidates->size - 1);
-    std::vector<float> second_derivatives(candidates->size - 2);
-
-    for (size_t i = 0; i < first_derivatives.size(); ++i) {
-        first_derivatives[i] = candidates->data[i].p - candidates->data[i + 1].p;
-    }
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = first_derivatives[i] - first_derivatives[i + 1];
-    }
-
-    // Calculate absolute value of second derivatives
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        second_derivatives[i] = abs(second_derivatives[i]);
-    }
-
-    // Normalize the second derivatives
-    float second_derivatives_sum = std::accumulate(second_derivatives.begin(), second_derivatives.end(), 0.0f);
-    for (float & value : second_derivatives) {
-        value /= second_derivatives_sum;
-    }
-
-    float cum_sum = 0.0f;
-    size_t last_idx = candidates->size;
-    for (size_t i = 0; i < second_derivatives.size(); ++i) {
-        cum_sum += second_derivatives[i];
-
-        // Check if the running sum is greater than z or if we have kept at least min_keep tokens
-        if (cum_sum > z && i >= min_keep) {
-            last_idx = i;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the tokens above the tail location
-    candidates->size = last_idx;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-
-void gptneox_sample_typical(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float p, size_t min_keep) {
-    // Reference implementation:
-    // https://github.com/huggingface/transformers/compare/main...cimeister:typical-sampling:typical-pr
-    if (p >= 1.0f) {
-        return;
-    }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Compute the softmax of logits and calculate entropy
-    gptneox_sample_softmax(nullptr, candidates);
-
-    float entropy = 0.0f;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        entropy += -candidates->data[i].p * logf(candidates->data[i].p);
-    }
-
-    // Compute the absolute difference between negative log probability and entropy for each candidate
-    std::vector<float> shifted_scores;
-    for (size_t i = 0; i < candidates->size; ++i) {
-        float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
-        shifted_scores.push_back(shifted_score);
-    }
-
-    // Sort tokens based on the shifted_scores and their corresponding indices
-    std::vector<size_t> indices(candidates->size);
-    std::iota(indices.begin(), indices.end(), 0);
-
-    std::sort(indices.begin(), indices.end(), [&](size_t a, size_t b) {
-        return shifted_scores[a] < shifted_scores[b];
-    });
-
-    // Compute the cumulative probabilities
-    float cum_sum = 0.0f;
-    size_t last_idx = indices.size();
-
-    for (size_t i = 0; i < indices.size(); ++i) {
-        size_t idx = indices[i];
-        cum_sum += candidates->data[idx].p;
-
-        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep - 1) {
-            last_idx = i + 1;
-            break;
-        }
-    }
-
-    // Resize the output vector to keep only the locally typical tokens
-    std::vector<gptneox_token_data> new_candidates;
-    for (size_t i = 0; i < last_idx; ++i) {
-        size_t idx = indices[i];
-        new_candidates.push_back(candidates->data[idx]);
-    }
-
-    // Replace the data in candidates with the new_candidates data
-    std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
-    candidates->size = new_candidates.size();
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void gptneox_sample_temperature(struct gptneox_context * ctx, gptneox_token_data_array * candidates_p, float temp) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < candidates_p->size; ++i) {
-        candidates_p->data[i].logit /= temp;
-    }
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void gptneox_sample_repetition_penalty(struct gptneox_context * ctx, gptneox_token_data_array * candidates, gptneox_token * last_tokens, size_t last_tokens_size, float penalty) {
-    if (last_tokens_size == 0 || penalty == 1.0f) {
-        return;
-    }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
-        if (token_iter == last_tokens + last_tokens_size) {
-            continue;
-        }
-
-        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
-        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
-        if (candidates->data[i].logit <= 0) {
-            candidates->data[i].logit *= penalty;
-        } else {
-            candidates->data[i].logit /= penalty;
-        }
-    }
-
-    candidates->sorted = false;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-void gptneox_sample_frequency_and_presence_penalties(struct gptneox_context * ctx, gptneox_token_data_array * candidates, gptneox_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
-    if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
-        return;
-    }
-
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Create a frequency map to count occurrences of each token in last_tokens
-    std::unordered_map<gptneox_token, int> token_count;
-    for (size_t i = 0; i < last_tokens_size; ++i) {
-        token_count[last_tokens_p[i]]++;
-    }
-
-    // Apply frequency and presence penalties to the candidates
-    for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = token_count.find(candidates->data[i].id);
-        if (token_iter == token_count.end()) {
-            continue;
-        }
-
-        int count = token_iter->second;
-        candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
-    }
-
-    candidates->sorted = false;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-}
-
-
-gptneox_token gptneox_sample_token_mirostat(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float tau, float eta, int m, float * mu) {
-    assert(ctx);
-    auto N = float(gptneox_n_vocab(ctx));
-    int64_t t_start_sample_us;
-    t_start_sample_us = ggml_time_us();
-
-    gptneox_sample_softmax(nullptr, candidates);
-
-    // Estimate s_hat using the most probable m tokens
-    float s_hat = 0.0;
-    float sum_ti_bi = 0.0;
-    float sum_ti_sq = 0.0;
-    for (size_t i = 0; i < size_t(m - 1) && i < candidates->size - 1; ++i) {
-        float t_i = logf(float(i + 2) / float(i + 1));
-        float b_i = logf(candidates->data[i].p / candidates->data[i + 1].p);
-        sum_ti_bi += t_i * b_i;
-        sum_ti_sq += t_i * t_i;
-    }
-    s_hat = sum_ti_bi / sum_ti_sq;
-
-    // Compute k from the estimated s_hat and target surprise value
-    float epsilon_hat = s_hat - 1;
-    float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);
-
-    // Sample the next word X using top-k sampling
-    gptneox_sample_top_k(nullptr, candidates, int(k), 1);
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-    gptneox_token X = gptneox_sample_token(ctx, candidates);
-    t_start_sample_us = ggml_time_us();
-
-    // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const gptneox_token_data & candidate) {
-        return candidate.id == X;
-    }));
-    float observed_surprise = -log2f(candidates->data[X_idx].p);
-    float e = observed_surprise - tau;
-
-    // Update mu using the learning rate and error
-    *mu = *mu - eta * e;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
-    }
-    return X;
-}
-
-gptneox_token gptneox_sample_token_mirostat_v2(struct gptneox_context * ctx, gptneox_token_data_array * candidates, float tau, float eta, float * mu) {
-    assert(ctx);
-    int64_t t_start_sample_us;
-    t_start_sample_us = ggml_time_us();
-
-    gptneox_sample_softmax(ctx, candidates);
-
-    // Truncate the words with surprise values greater than mu
-    candidates->size = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const gptneox_token_data & candidate) {
-        return -log2f(candidate.p) > *mu;
-    }));
-
-    // Normalize the probabilities of the remaining words
-    gptneox_sample_softmax(ctx, candidates);
-
-    // Sample the next word X from the remaining words
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-    gptneox_token X = gptneox_sample_token(ctx, candidates);
-    t_start_sample_us = ggml_time_us();
-
-    // Compute error as the difference between observed surprise and target surprise value
-    size_t X_idx = std::distance(candidates->data, std::find_if(candidates->data, candidates->data + candidates->size, [&](const gptneox_token_data & candidate) {
-        return candidate.id == X;
-    }));
-    float observed_surprise = -log2f(candidates->data[X_idx].p);
-    float e = observed_surprise - tau;
-
-    // Update mu using the learning rate and error
-    *mu = *mu - eta * e;
-
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
-    return X;
-}
-
-gptneox_token gptneox_sample_token_greedy(struct gptneox_context * ctx, gptneox_token_data_array * candidates) {
-    const int64_t t_start_sample_us = ggml_time_us();
-
-    // Find max element
-    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const gptneox_token_data & a, const gptneox_token_data & b) {
-        return a.logit < b.logit;
-    });
-
-    gptneox_token result = max_iter->id;
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
-    }
-    return result;
-}
-
-gptneox_token gptneox_sample_token(struct gptneox_context * ctx, gptneox_token_data_array * candidates) {
-    assert(ctx);
-    const int64_t t_start_sample_us = ggml_time_us();
-    gptneox_sample_softmax(nullptr, candidates);
-
-    std::vector<float> probs;
-    probs.reserve(candidates->size);
-    for (size_t i = 0; i < candidates->size; ++i) {
-        probs.push_back(candidates->data[i].p);
-    }
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    auto & rng = ctx->rng;
-    int idx = dist(rng);
-
-    gptneox_token result = candidates->data[idx].id;
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    ctx->n_sample++;
-    return result;
-}
-
-//
-// quantization
-//
-
-// temp - load then save model, allows for load and save to be different
-static void gptneox_model_copy_internal(const std::string & fname_inp, const std::string & fname_out, enum gptneox_ftype ftype) {
-    std::unique_ptr<gptneox_model_loader> model_loader(new gptneox_model_loader(fname_inp.c_str(),
-                                                            /*use_mmap*/ false,
-                                                            /*vocab_only*/ false));
-    gptneox_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
-
-    size_t idx = 0;
-    for (gptneox_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gptneox_buffer read_data;
-        read_data.resize(tensor.size);
-        tensor.data = read_data.addr;
-        model_loader->load_data_for(tensor);
-
-        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
-               ++idx, model_loader->tensors_map.tensors.size(),
-               tensor.name.c_str(), gptneox_format_tensor_shape(tensor.ne).c_str(),
-               ggml_type_name(tensor.type));
-
-        file_saver.write_tensor(tensor, tensor.type, tensor.data, tensor.size);
-    }
-}
-
-int gptneox_model_copy(
-        const char * fname_inp,
-        const char * fname_out,
-  enum gptneox_ftype   ftype) {
-    try {
-        gptneox_model_copy_internal(fname_inp, fname_out, ftype);
-        return 0;
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to copy: %s\n", __func__, err.c_str());
-        return 1;
-    }
-}
-
-
-static void gptneox_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum gptneox_ftype ftype, int nthread) {
-    ggml_type quantized_type;
-    switch (ftype) {
-        case GPTNEOX_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
-        case GPTNEOX_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case GPTNEOX_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
-        case GPTNEOX_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
-        case GPTNEOX_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
-        case GPTNEOX_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
-        default: throw format("invalid output file type %d\n", ftype);
-    };
-
-    if (nthread <= 0) {
-        nthread = std::thread::hardware_concurrency();
-    }
-
-    std::unique_ptr<gptneox_model_loader> model_loader(new gptneox_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
-                                                                            /*vocab_only*/ false));
-    gptneox_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
-
-    size_t total_size_org = 0;
-    size_t total_size_new = 0;
-    std::vector<int64_t> hist_all(1 << 4, 0);
-
-    std::vector<std::thread> workers;
-    std::mutex mutex;
-
-    size_t idx = 0;
-    for (gptneox_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gptneox_buffer read_data;
-        read_data.resize(tensor.size);
-        tensor.data = read_data.addr;
-        model_loader->load_data_for(tensor);
-
-        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
-               ++idx, model_loader->tensors_map.tensors.size(),
-               tensor.name.c_str(), gptneox_format_tensor_shape(tensor.ne).c_str(),
-               ggml_type_name(tensor.type));
-
-        // This used to be a regex, but <regex> has an extreme cost to compile times.
-        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
-
-        // quantize only 2D tensors
-        quantize &= (tensor.ne.size() == 2);
-
-        // uncomment this to keep the output layer in FP16
-        //if (tensor.name == "output.weight") {
-        //    quantize = false;
-        //}
-
-        enum ggml_type new_type;
-        void * new_data;
-        size_t new_size;
-        gptneox_buffer work;
-
-        if (!quantize) {
-            new_type = tensor.type;
-            new_data = tensor.data;
-            new_size = tensor.size;
-            printf("size = %8.3f MiB\n", tensor.size/1024.0/1024.0);
-        } else {
-            new_type = quantized_type;
-            float * f32_data;
-            size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
-            gptneox_buffer f32_conv_buf;
-            if (tensor.type == GGML_TYPE_F32) {
-                f32_data = (float *) tensor.data;
-            } else if (tensor.type == GGML_TYPE_F16) {
-                f32_conv_buf.resize(nelements * sizeof(float));
-                f32_data = (float *) f32_conv_buf.addr;
-                auto f16_data = (const ggml_fp16_t *) tensor.data;
-                for (size_t i = 0; i < nelements; i++) {
-                    f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
-                }
-            } else {
-                throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
-            }
-
-            printf("quantizing .. ");
-            fflush(stdout);
-
-            work.resize(nelements * 4); // upper bound on size
-            new_data = work.addr;
-            std::vector<int64_t> hist_cur(1 << 4, 0);
-
-            int chunk_size = 32 * 512;
-            const int nchunk = (nelements + chunk_size - 1)/chunk_size;
-            const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-            if (nthread_use < 2) {
-                new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
-            } else {
-                size_t counter = 0;
-                new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
-                    std::vector<int64_t> local_hist;
-                    size_t local_size = 0;
-                    while (true) {
-                        std::unique_lock<std::mutex> lock(mutex);
-                        size_t first = counter; counter += chunk_size;
-                        if (first >= nelements) {
-                            if (!local_hist.empty()) {
-                                for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
-                                new_size += local_size;
-                            }
-                            break;
-                        }
-                        lock.unlock();
-                        size_t last = std::min(nelements, first + chunk_size);
-                        if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
-                        local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
-                    }
-                };
-                if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
-                for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
-                compute();
-                for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
-            }
-
-            printf("size = %8.2f MiB -> %8.2f MiB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
-            for (size_t i = 0; i < hist_cur.size(); i++) {
-                hist_all[i] += hist_cur[i];
-            }
-
-            for (size_t i = 0; i < hist_cur.size(); i++) {
-                printf("%5.3f ", hist_cur[i] / float(nelements));
-            }
-            printf("\n");
-        }
-        total_size_org += tensor.size;
-        total_size_new += new_size;
-        file_saver.write_tensor(tensor, new_type, new_data, new_size);
-    }
-
-    printf("%s: model size  = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
-    printf("%s: quant size  = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
-
-    {
-        int64_t sum_all = 0;
-        for (size_t i = 0; i < hist_all.size(); i++) {
-            sum_all += hist_all[i];
-        }
-
-        printf("%s: hist: ", __func__);
-        for (size_t i = 0; i < hist_all.size(); i++) {
-            printf("%5.3f ", hist_all[i] / float(sum_all));
-        }
-        printf("\n");
-    }
-}
-
-//
-// interface implementation
-//
-
-struct gptneox_context * gptneox_init_from_file(
-                             const char * path_model,
-            struct gptneox_context_params   params) {
-    ggml_time_init();
-
-    gptneox_context * ctx = new gptneox_context;
-
-    if (params.seed <= 0) {
-        params.seed = time(NULL);
-    }
-
-    unsigned cur_percentage = 0;
-    if (params.progress_callback == NULL) {
-        params.progress_callback_user_data = &cur_percentage;
-        params.progress_callback = [](float progress, void * ctx) {
-            unsigned * cur_percentage_p = (unsigned *) ctx;
-            unsigned percentage = (unsigned) (100 * progress);
-            while (percentage > *cur_percentage_p) {
-                ++*cur_percentage_p;
-                fprintf(stderr, ".");
-                fflush(stderr);
-                if (percentage >= 100) {
-                    fprintf(stderr, "\n");
-                }
-            }
-        };
-    }
-
-    ctx->rng = std::mt19937(params.seed);
-    ctx->logits_all = params.logits_all;
-
-    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
-    if (!gptneox_model_load(path_model, *ctx, params.n_ctx, memory_type,
-                          params.use_mmap, params.use_mlock, params.vocab_only,
-                          params.progress_callback, params.progress_callback_user_data)) {
-        fprintf(stderr, "%s: failed to load model\n", __func__);
-        gptneox_free(ctx);
-        return nullptr;
-    }
-
-    // reserve memory for context buffers
-    if (!params.vocab_only) {
-        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
-            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
-            gptneox_free(ctx);
-            return nullptr;
-        }
-
-        {
-            const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
-            fprintf(stderr, "%s: kv self size  = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
-        }
-
-        const auto & hparams = ctx->model.hparams;
-
-        // resized during inference
-        if (params.logits_all) {
-            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
-        } else {
-            ctx->logits.reserve(hparams.n_vocab);
-        }
-
-        if (params.embedding){
-            ctx->embedding.resize(hparams.n_embd);
-        }
-
-        //ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
-
-        ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
-        ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
-    }
-
-    return ctx;
-}
-
-void gptneox_free(struct gptneox_context * ctx) {
-    delete ctx;
-}
-
-int gptneox_model_quantize(
-        const char * fname_inp,
-        const char * fname_out,
-  enum gptneox_ftype   ftype,
-        int          nthread) {
-    try {
-        gptneox_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
-        return 0;
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
-        return 1;
-    }
-}
-
-int gptneox_apply_lora_from_file_internal(struct gptneox_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
-    fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
-
-    auto & model = ctx->model;
-
-    const int64_t t_start_lora_us = ggml_time_us();
-
-    auto fin = std::ifstream(path_lora, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
-        return 1;
-    }
-
-    // verify magic and version
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
-            fprintf(stderr, "%s: bad file magic\n", __func__);
-            return 1;
-        }
-        uint32_t format_version;
-        fin.read((char *) &format_version, sizeof(format_version));
-
-        if (format_version != 1) {
-            fprintf(stderr, "%s: unsupported file version\n", __func__ );
-            return 1;
-        }
-    }
-
-    int32_t lora_r;
-    int32_t lora_alpha;
-    fin.read((char *) &lora_r, sizeof(lora_r));
-    fin.read((char *) &lora_alpha, sizeof(lora_alpha));
-    float scaling = (float)lora_alpha / (float)lora_r;
-
-    fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
-
-
-    // create a temporary ggml context to store the lora tensors
-    // todo: calculate size from biggest possible tensor
-    std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
-    struct gml_init_params params;
-    params.mem_size   = lora_buf.size();
-    params.mem_buffer = lora_buf.data();
-    params.no_alloc   = false;
-
-    ggml_context * lora_ctx = ggml_init(params);
-    std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
-
-    // create a name -> tensor map of the model to accelerate lookups
-    std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (auto & kv: model.tensors_by_name) {
-        model_tensors.insert(kv);
-    }
-
-
-    // load base model
-    std::unique_ptr<gptneox_model_loader> model_loader;
-    ggml_context * base_ctx = NULL;
-    gptneox_buffer base_buf;
-    if (path_base_model) {
-        fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
-        model_loader.reset(new gptneox_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
-
-        size_t ctx_size, mmapped_size;
-        model_loader->calc_sizes(&ctx_size, &mmapped_size);
-        base_buf.resize(ctx_size);
-
-        ggml_init_params base_params;
-        base_params.mem_size   = base_buf.size;
-        base_params.mem_buffer = base_buf.addr;
-        base_params.no_alloc   = model_loader->use_mmap;
-
-        base_ctx = ggml_init(base_params);
-
-        model_loader->ggml_ctx = base_ctx;
-
-        // maybe this should in gptneox_model_loader
-        if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new gptneox_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
-        }
-    }
-
-    // read tensors and apply
-    bool warned = false;
-    int n_tensors = 0;
-    while (true) {
-        int32_t n_dims;
-        int32_t length;
-        int32_t ftype;
-
-        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-        fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
-        if (fin.eof()) {
-            break;
-        }
-
-        int32_t ne[2] = { 1, 1 };
-        for (int i = 0; i < n_dims; ++i) {
-            fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-        }
-
-        std::string name(length, 0);
-        fin.read(&name[0], length);
-
-        // check for lora suffix and get the type of tensor
-        const std::string lora_suffix = ".lora";
-        size_t pos = name.rfind(lora_suffix);
-        if (pos == std::string::npos) {
-            fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
-            return 1;
-        }
-
-        std::string lora_type = name.substr(pos + lora_suffix.length());
-        std::string base_name = name;
-        base_name.erase(pos);
-        // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
-
-        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
-            fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
-            return 1;
-        }
-
-        // create ggml tensor
-        ggml_type wtype;
-        switch (ftype) {
-            case 0: wtype = GGML_TYPE_F32;  break;
-            case 1: wtype = GGML_TYPE_F16;  break;
-            default:
-                    {
-                        fprintf(stderr, "%s: invalid tensor data type '%d'\n",
-                                __func__, ftype);
-                        return false;
-                    }
-        }
-        ggml_tensor* lora_tensor;
-        if (n_dims == 2) {
-            lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
-        }
-        else {
-            fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
-            return 1;
-        }
-
-        // load tensor data
-        size_t offset = fin.tellg();
-        size_t tensor_data_size = ggml_nbytes(lora_tensor);
-        offset = (offset + 31) & -32;
-        fin.seekg(offset);
-        fin.read((char*)lora_tensor->data, tensor_data_size);
-
-        lora_tensors[name] = lora_tensor;
-
-        // check if we have both A and B tensors and apply
-        if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
-            lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
-
-            ggml_tensor * dest_t = model_tensors[base_name];
-            ggml_tensor * base_t;
-            if (model_loader) {
-                // load from base model
-                if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
-                    fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
-                    return 1;
-                }
-                size_t idx = model_loader->tensors_map.name_to_idx[base_name];
-                gptneox_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
-                lt.data = (uint8_t *) lt.ggml_tensor->data;
-                model_loader->load_data_for(lt);
-                lt.ggml_tensor->data = lt.data;
-            }
-            else {
-                base_t = dest_t;
-            }
-
-            if (ggml_is_quantized(base_t->type)) {
-                if (!warned) {
-                    fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
-                                    "use a f16 or f32 base model with --lora-base\n", __func__);
-                    warned = true;
-                }
-            }
-
-            ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
-            ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
-
-            if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
-                fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
-                               " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
-                return 1;
-            }
-
-            // w = w + BA*s
-            ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
-
-            if (scaling != 1.0f) {
-                ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
-                BA = ggml_scale(lora_ctx, BA, scale_tensor);
-            }
-
-            ggml_tensor * r;
-            if (base_t == dest_t) {
-                r = ggml_add_inplace(lora_ctx, dest_t, BA);
-            }
-            else {
-                r = ggml_add(lora_ctx, base_t, BA);
-                r = ggml_cpy(lora_ctx, r, dest_t);
-            }
-
-            struct ggml_cgraph gf = ggml_build_forward(r);
-            gf.n_threads = n_threads;
-            ggml_graph_compute(lora_ctx, &gf);
-
-            // we won't need these tensors again, reset the context to save memory
-            ggml_free(lora_ctx);
-            lora_ctx = ggml_init(params);
-            lora_tensors.clear();
-
-            n_tensors++;
-            if (n_tensors % 4 == 0)
-                fprintf(stderr, ".");
-        }
-    }
-
-    // TODO: this should be in a destructor, it will leak on failure
-    ggml_free(lora_ctx);
-    if (base_ctx) {
-        ggml_free(base_ctx);
-    }
-
-    const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
-    fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
-
-    return 0;
-}
-
-int gptneox_apply_lora_from_file(struct gptneox_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
-    try {
-        return gptneox_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
-    } catch (const std::string & err) {
-        fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
-        return 1;
-    }
-}
-
-int gptneox_get_kv_cache_token_count(struct gptneox_context * ctx) {
-    return ctx->model.kv_self.n;
-}
-
-#define GPTNEOX_MAX_RNG_STATE 64*1024
-
-void gptneox_set_rng_seed(struct gptneox_context * ctx, int seed) {
-    if (seed <= 0) {
-        seed = time(NULL);
-    }
-    ctx->rng.seed(seed);
-}
-
-// Returns the size of the state
-size_t gptneox_get_state_size(struct gptneox_context * ctx) {
-    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
-    // for reference, std::mt19937(1337) serializes to 6701 bytes.
-    const size_t s_rng_size        = sizeof(size_t);
-    const size_t s_rng             = GPTNEOX_MAX_RNG_STATE;
-    const size_t s_logits_capacity = sizeof(size_t);
-    const size_t s_logits_size     = sizeof(size_t);
-    const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
-    const size_t s_embedding_size  = sizeof(size_t);
-    const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
-    const size_t s_kv_size         = sizeof(size_t);
-    const size_t s_kv_ntok         = sizeof(int);
-    const size_t s_kv              = ctx->model.kv_self.buf.size;
-
-    const size_t s_total = (
-        + s_rng_size
-        + s_rng
-        + s_logits_capacity
-        + s_logits_size
-        + s_logits
-        + s_embedding_size
-        + s_embedding
-        + s_kv_size
-        + s_kv_ntok
-        + s_kv
-    );
-
-    return s_total;
-}
-
-// Copies the state to the specified destination address
-size_t gptneox_copy_state_data(struct gptneox_context * ctx, uint8_t * dest) {
-    uint8_t * out = dest;
-
-    // copy rng
-    {
-        std::stringstream rng_ss;
-        rng_ss << ctx->rng;
-
-        const size_t rng_size = rng_ss.str().size();
-        char rng_buf[GPTNEOX_MAX_RNG_STATE];
-
-        memset(&rng_buf[0], 0, GPTNEOX_MAX_RNG_STATE);
-        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
-
-        memcpy(out, &rng_size,   sizeof(rng_size));    out += sizeof(rng_size);
-        memcpy(out, &rng_buf[0], GPTNEOX_MAX_RNG_STATE); out += GPTNEOX_MAX_RNG_STATE;
-    }
-
-    // copy logits
-    {
-        const size_t logits_cap  = ctx->logits.capacity();
-        const size_t logits_size = ctx->logits.size();
-
-        memcpy(out, &logits_cap,  sizeof(logits_cap));  out += sizeof(logits_cap);
-        memcpy(out, &logits_size, sizeof(logits_size)); out += sizeof(logits_size);
-
-        if (logits_size) {
-            memcpy(out, ctx->logits.data(), logits_size * sizeof(float));
-        }
-
-        out += logits_cap * sizeof(float);
-    }
-
-    // copy embeddings
-    {
-        const size_t embedding_size = ctx->embedding.size();
-
-        memcpy(out, &embedding_size, sizeof(embedding_size)); out += sizeof(embedding_size);
-
-        if (embedding_size) {
-            memcpy(out, ctx->embedding.data(), embedding_size * sizeof(float));
-            out += embedding_size * sizeof(float);
-        }
-    }
-
-    // copy kv cache
-    {
-        const size_t kv_size = ctx->model.kv_self.buf.size;
-        const int    kv_ntok = gptneox_get_kv_cache_token_count(ctx);
-
-        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
-        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
-
-        if (kv_size) {
-            memcpy(out, ctx->model.kv_self.buf.addr, kv_size); out += kv_size;
-        }
-    }
-
-    const size_t written  = out - dest;
-    const size_t expected = gptneox_get_state_size(ctx);
-
-    GPTNEOX_ASSERT(written == expected);
-
-    return written;
-}
-
-// Sets the state reading from the specified source address
-size_t gptneox_set_state_data(struct gptneox_context * ctx, const uint8_t * src) {
-    const uint8_t * in = src;
-
-    // set rng
-    {
-        size_t rng_size;
-        char   rng_buf[GPTNEOX_MAX_RNG_STATE];
-
-        memcpy(&rng_size,   in, sizeof(rng_size));    in += sizeof(rng_size);
-        memcpy(&rng_buf[0], in, GPTNEOX_MAX_RNG_STATE); in += GPTNEOX_MAX_RNG_STATE;
-
-        std::stringstream rng_ss;
-        rng_ss.str(std::string(&rng_buf[0], rng_size));
-        rng_ss >> ctx->rng;
-
-        GPTNEOX_ASSERT(rng_ss.fail() == false);
-    }
-
-    // set logits
-    {
-        size_t logits_cap;
-        size_t logits_size;
-
-        memcpy(&logits_cap,  in, sizeof(logits_cap));  in += sizeof(logits_cap);
-        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
-
-        GPTNEOX_ASSERT(ctx->logits.capacity() == logits_cap);
-
-        if (logits_size) {
-            ctx->logits.resize(logits_size);
-            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
-        }
-
-        in += logits_cap * sizeof(float);
-    }
-
-    // set embeddings
-    {
-        size_t embedding_size;
-
-        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
-
-        GPTNEOX_ASSERT(ctx->embedding.capacity() == embedding_size);
-
-        if (embedding_size) {
-            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
-            in += embedding_size * sizeof(float);
-        }
-    }
-
-    // set kv cache
-    {
-        size_t kv_size;
-        int kv_ntok;
-
-        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
-        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
-
-        if (kv_size) {
-            GPTNEOX_ASSERT(ctx->model.kv_self.buf.size == kv_size);
-
-            void * k_data = ctx->model.kv_self.k->data; // remember data pointers
-            void * v_data = ctx->model.kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
-
-            memcpy(ctx->model.kv_self.buf.addr, in, kv_size); in += kv_size;
-
-            ctx->model.kv_self.k->data = k_data; // restore correct data pointers
-            ctx->model.kv_self.v->data = v_data;
-
-        }
-
-        ctx->model.kv_self.n = kv_ntok;
-    }
-
-    const size_t nread    = in - src;
-    const size_t expected = gptneox_get_state_size(ctx);
-
-    GPTNEOX_ASSERT(nread == expected);
-
-    return nread;
-}
-
-int gptneox_eval(
-        struct gptneox_context * ctx,
-           const gptneox_token * tokens,
-                         int   n_tokens,
-                         int   n_past,
-                         int   n_threads) {
-    if (!gptneox_eval_internal(*ctx, tokens, n_tokens, n_past, n_threads)) {
-        fprintf(stderr, "%s: failed to eval\n", __func__);
-        return 1;
-    }
-    // get a more accurate load time, upon first eval
-    if (!ctx->has_evaluated_once) {
-        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
-        ctx->has_evaluated_once = true;
-    }
-    return 0;
-}
-
-int gptneox_tokenize(
-        struct gptneox_context * ctx,
-                  const char * text,
-                 gptneox_token * tokens,
-                         int   n_max_tokens,
-                        bool   add_bos) {
-    auto res = gptneox_tokenize(ctx->vocab, text, add_bos);
-
-    if (n_max_tokens < (int) res.size()) {
-        fprintf(stderr, "%s: too many tokens\n", __func__);
-        return -((int) res.size());
-    }
-
-    for (size_t i = 0; i < res.size(); i++) {
-        tokens[i] = res[i];
-    }
-
-    return res.size();
-}
-
-int gptneox_n_vocab(struct gptneox_context * ctx) {
-    return ctx->vocab.id_to_token.size();
-}
-
-int gptneox_n_ctx(struct gptneox_context * ctx) {
-    return ctx->model.hparams.n_ctx;
-}
-
-int gptneox_n_embd(struct gptneox_context * ctx) {
-    return ctx->model.hparams.n_embd;
-}
-
-float * gptneox_get_logits(struct gptneox_context * ctx) {
-    return ctx->logits.data();
-}
-
-float * gptneox_get_embeddings(struct gptneox_context * ctx) {
-    return ctx->embedding.data();
-}
-
-const char * gptneox_token_to_str(struct gptneox_context * ctx, gptneox_token token) {
-    if (token >= gptneox_n_vocab(ctx)) {
-        return nullptr;
-    }
-
-    return ctx->vocab.id_to_token[token].tok.c_str();
-}
-
-gptneox_token gptneox_str_to_token(struct gptneox_context * ctx, const char * str) {
-    return ctx->vocab.token_to_id[str];
-}
-
-gptneox_token gptneox_token_bos() {
-    return 0;
-}
-
-gptneox_token gptneox_token_eos() {
-    return 0;
-}
-
-// Varies depending on gptneox model, use gptneox_str_to_token instead
-gptneox_token gptneox_token_nl() {
-    return 13;
-}
-
-
-void gptneox_print_timings(struct gptneox_context * ctx) {
-    const int64_t t_end_us = ggml_time_us();
-
-    const int32_t n_sample = std::max(1, ctx->n_sample);
-    const int32_t n_eval   = std::max(1, ctx->n_eval);
-    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
-
-    fprintf(stderr, "\n");
-    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
-    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_eval_us,   n_eval,   1e-3 * ctx->t_eval_us   / n_eval);
-    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
-}
-
-void gptneox_reset_timings(struct gptneox_context * ctx) {
-    ctx->t_start_us = ggml_time_us();
-    ctx->t_sample_us = ctx->n_sample = 0;
-    ctx->t_eval_us   = ctx->n_eval   = 0;
-    ctx->t_p_eval_us = ctx->n_p_eval = 0;
-}
-
-
-// For internal test use
-std::vector<std::pair<std::string, struct ggml_tensor *>>& gptneox_internal_get_tensor_map(struct gptneox_context * ctx) {
-    return ctx->model.tensors_by_name;
-}
-
-size_t gptneox_load_session_file(struct gptneox_context * ctx, const char * path_session, gptneox_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) {
-    // TODO leverage mmap
-    gptneox_file file(path_session, "rb");
-    const uint32_t magic = file.read_u32();
-    const uint32_t version = file.read_u32();
-
-    if (!(magic == 'ggsn' && version == 0)) {
-        fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version);
-        return 0;
-    }
-
-    gptneox_hparams session_hparams;
-    file.read_raw(&session_hparams, sizeof(gptneox_hparams));
-
-    // REVIEW
-    if (session_hparams != ctx->model.hparams) {
-        fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__);
-        return 0;
-    }
-
-    const uint32_t n_token_count = file.read_u32();
-    GPTNEOX_ASSERT(n_token_capacity >= n_token_count);
-    file.read_raw(tokens_out, sizeof(gptneox_token) * n_token_count);
-    *n_token_count_out = n_token_count;
-
-    const size_t n_state_size = file.size - file.tell();
-    const size_t n_orig_state_size = gptneox_get_state_size(ctx);
-    if (n_state_size != n_orig_state_size) {
-        fprintf(stderr, "%s : failed to validate state size\n", __func__);
-    }
-    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
-    file.read_raw(state_data.get(), n_state_size);
-    return gptneox_set_state_data(ctx, state_data.get());
-}
-
-size_t gptneox_save_session_file(struct gptneox_context * ctx, const char * path_session, const gptneox_token * tokens, size_t n_token_count) {
-    // TODO save temp & swap
-    gptneox_file file(path_session, "wb");
-
-    const size_t n_state_size = gptneox_get_state_size(ctx);
-    std::unique_ptr<uint8_t[]> state_data(new uint8_t[n_state_size]);
-    gptneox_copy_state_data(ctx, state_data.get());
-
-    file.write_u32('ggsn'); // magic
-    file.write_u32(0); // version
-    file.write_raw(&ctx->model.hparams, sizeof(gptneox_hparams));
-
-    file.write_u32((uint32_t) n_token_count); // REVIEW
-    file.write_raw(tokens, sizeof(gptneox_token) * n_token_count);
-
-    file.write_raw(state_data.get(), n_state_size);
-    return n_state_size; // REVIEW
-}
-
-
-static std::atomic<bool> is_interacting;
-static std::atomic<bool> is_terminated;
-
-#define EPHEMERAL(fmt) "\r\e[K\033[1;35m" fmt " \033[0m"
-
-static void sigint_handler_batch(int signo) {
-    is_terminated = true;
-}
-
-static void sigint_handler_interactive(int signo) {
-    if (!is_interacting) {
-        is_interacting = true;
-    } else {
-        is_terminated = true;
-    }
-}
-
-static int CompareTime(struct timespec a, struct timespec b) {
-  int cmp;
-  if (!(cmp = (a.tv_sec > b.tv_sec) - (a.tv_sec < b.tv_sec))) {
-    cmp = (a.tv_nsec > b.tv_nsec) - (a.tv_nsec < b.tv_nsec);
-  }
-  return cmp;
-}
-
-static int on_missing_feature(const char *name) {
-    fprintf(stderr, "%s: error: cpuid %s not detected\n", __func__, name);
-    fprintf(stderr, "%s: amd microprocessors made after 2017 usually work\n", __func__);
-    fprintf(stderr, "%s: intel microprocessors made after 2013 usually work\n", __func__);
-    return 1;
-}
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-    // Models can be freely downloaded from:
-    // https://huggingface.co/ceonlabs/radpajama/tree/main
-    // Preferred one in float16 format, this can be used to generate the quantized one.
-    // ggml-RedPajama-INCITE-Chat-3B-v1-f16.bin 5.55GB
-    // Quantized for faster inference, 8bit integers.
-    // ggml-RedPajama-INCITE-Chat-3B-v1-q8_0.bin 3.13 GB
-    params.model = "ggml-RedPajama-INCITE-Chat-3B-v1-f16.bin";
-    
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-
-
-    ShowCrashReports();
-    setvbuf(stdin, NULL, _IONBF, 0);
-    setvbuf(stdout, NULL, _IONBF, 0);
-    setvbuf(stderr, NULL, _IONBF, 0);
-
-
-#ifdef __x86_64__
-    if (!X86_HAVE(AVX2)) return on_missing_feature("avx2");
-    if (!X86_HAVE(AVX)) return on_missing_feature("avx");
-    if (!X86_HAVE(FMA)) return on_missing_feature("fma");
-    if (!X86_HAVE(SSE3)) return on_missing_feature("sse3");
-    if (!X86_HAVE(F16C)) {
-        fprintf(stderr, "%s: warning: cpuid f16c not detected; inference might crash\n", __func__);
-    }
-#endif /* __x86_64__ */
-
-    gptneox_context * ctx;
-    g_ctx = &ctx;
-
-       // load the model
-    {
-        auto lparams = gptneox_context_default_params();
-
-        lparams.n_ctx      = params.n_ctx;
-        lparams.n_parts    = params.n_parts;
-        lparams.seed       = params.seed;
-        lparams.f16_kv     = params.memory_f16;
-        lparams.use_mmap   = params.use_mmap;
-        lparams.use_mlock  = params.use_mlock;
-
-        ctx = gptneox_init_from_file(params.model.c_str(), lparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-            return 1;
-        }
-    }
-
-   if (!params.lora_adapter.empty()) {
-        int err = gptneox_apply_lora_from_file(ctx,
-                                               params.lora_adapter.c_str(),
-                                               params.lora_base.empty() ? NULL : params.lora_base.c_str(),
-                                               params.n_threads);
-        if (err != 0) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
-            return 1;
-        }
-    }
-    
-    // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | \n",
-                params.n_threads, std::thread::hardware_concurrency());
-    }
-    
-    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
-    // uncomment the "used_mem" line in llama.cpp to see the results
-    if (params.mem_test) {
-        {
-            const std::vector<gptneox_token> tmp(params.n_batch, 0);
-            gptneox_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
-        }
-        
-        {
-            const std::vector<gptneox_token> tmp = { 0, };
-            gptneox_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
-        }
-        
-        gptneox_print_timings(ctx);
-        gptneox_free(ctx);
-        
-        return 0;
-    }
-
-    
-    // Always interactive in Open-Assistant
-    params.interactive = true;
-    
-    if (params.interactive) {
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-        struct sigaction sigint_action;
-        sigint_action.sa_handler = sigint_handler;
-        sigemptyset (&sigint_action.sa_mask);
-        sigint_action.sa_flags = 0;
-        sigaction(SIGINT, &sigint_action, NULL);
-#elif defined (_WIN32)
-        signal(SIGINT, sigint_handler);
-#endif
-    }
-    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
-        params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", params.n_ctx, params.n_batch, params.n_predict, params.n_keep);
-    fprintf(stderr, "\n\n");
-    
-    // TODO: replace with ring-buffer
-    std::vector<gptneox_token> last_n_tokens = std::vector<gptneox_token>();
-    //std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
-    
-    
-    if (params.interactive) {
-        printf("== Running in interactive mode. ==\n"
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
-               " - Press Ctrl+C to interject at any time.\n"
-#endif
-               " - Press Return to return control to LLaMa.\n"
-               " - If you want to submit another line, end your input in '\\'.\n\n");
-    }
-    
-    const int32_t top_k          = params.top_k;
-    const float   top_p          = params.top_p;
-    const float   temp           = params.temp;
-    const float   repeat_penalty = params.repeat_penalty;
-    
-    // Chat loop
-    while (true) {
-        is_interacting = true;
-        
-        int n_past = 0;
-        
-        // Get input
-        
-        // potentially set color to indicate we are taking user input
-        
-#if defined (_WIN32)
-        // Windows: must reactivate sigint handler after each signal
-        signal(SIGINT, sigint_handler);
-#endif
-
-        if (params.instruct) {
-            printf("\n<human>: ");
-        }
-
-        std::string buffer;
-        if (!params.input_prefix.empty()) {
-            buffer += params.input_prefix;
-            printf("%s", buffer.c_str());
-        }
-
-        std::string line;
-        bool another_line = true;
-        do {
-#if defined(_WIN32)
-            std::wstring wline;
-            if (!std::getline(std::wcin, wline)) {
-                // input stream is bad or EOF received
-                return 0;
-            }
-            win32_utf8_encode(wline, line);
-#else
-            if (!std::getline(std::cin, line)) {
-                // input stream is bad or EOF received
-                return 0;
-            }
-#endif
-            if (line.empty() || line.back() != '\\') {
-                another_line = false;
-            } else {
-                line.pop_back(); // Remove the continue character
-            }
-            buffer += line;
-            if (another_line) {
-                buffer += '\n';
-            }
-        } while (another_line);
-        
-        is_interacting = false;
-        
-        // done taking input, reset color
-        
-        // Check for input
-        if (buffer.length() <= 0) {
-            continue; // Restart loop for input
-        }
-        
-        // Tokenize prompt with oasst special tokens
-
-        auto prompt_embd = ::gptneox_tokenize(ctx, buffer, false);
-        auto embd_inp = std::vector<gptneox_token>();
-
-        // Redpajama: insert special tokens for OA. (prefix)
-        embd_inp.push_back(gptneox_str_to_token(ctx, "<"));
-        embd_inp.push_back(gptneox_str_to_token(ctx, "human"));
-        embd_inp.push_back(gptneox_str_to_token(ctx, ">:"));
-        
-        embd_inp.insert(embd_inp.end(), prompt_embd.begin(), prompt_embd.end());
-
-        // Redpajama: insert special tokens for OA. (postfix)
-        embd_inp.push_back(gptneox_str_to_token(ctx, "<"));
-        embd_inp.push_back(gptneox_str_to_token(ctx, "bot"));
-        embd_inp.push_back(gptneox_str_to_token(ctx, ">:"));
-       
-        
-        // Verbose prompt
-        if (params.verbose_prompt) {
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s: prompt: '%s'\n", __func__, buffer.c_str());
-            fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
-            for (int i = 0; i < (int) embd_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], gptneox_token_to_str(ctx, embd_inp[i]));
-            }
-            /*if (params.n_keep > 0) {
-            fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
-                for (int i = 0; i < params.n_keep; i++) {
-                    fprintf(stderr, "%s", gptneox_token_to_str(ctx, embd_inp[i]));
-                }
-                fprintf(stderr, "'\n");
-            }
-             */
-            fprintf(stderr, "\n");
-        }
-        
-        // How many tokens to generate - check if theres space in context for atleast one token (or batch size tokens?)
-        auto inp_size = embd_inp.size();
-        auto space = params.n_ctx - inp_size;
-        if(space <= 0) {
-            fprintf(stderr, "%s : input too long\n", __func__);
-            continue;
-        }
-        // Send batches to eval
-        while (n_past < inp_size) {
-            auto remaining = inp_size - n_past;
-            int n_eval = params.n_batch < remaining ? params.n_batch : remaining;
-            if (gptneox_eval(ctx, &embd_inp[n_past], n_eval, n_past, params.n_threads)) {
-                fprintf(stderr, "<bot>: %s : failed to eval\n", __func__);
-                return 1;
-            }
-            n_past += n_eval;
-        }
-        
-        const int n_ctx = gptneox_n_ctx(ctx);
-        const int n_vocab = gptneox_n_vocab(ctx);
-        
-        const float   temp            = params.temp;
-        const int32_t top_k           = params.top_k <= 0 ? gptneox_n_vocab(ctx) : params.top_k;
-        const float   top_p           = params.top_p;
-        const float   tfs_z           = params.tfs_z;
-        const float   typical_p       = params.typical_p;
-        const int32_t repeat_last_n   = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
-        const float   repeat_penalty  = params.repeat_penalty;
-        const float   alpha_presence  = params.presence_penalty;
-        const float   alpha_frequency = params.frequency_penalty;
-        const int     mirostat        = params.mirostat;
-        const float   mirostat_tau    = params.mirostat_tau;
-        const float   mirostat_eta    = params.mirostat_eta;
-        const bool    penalize_nl     = params.penalize_nl;
-        
-        // Eval until space runs out
-        auto out_count = 0;
-        
-        printf("<bot>:");
-        while (space > 0) {
-            // Get token
-            gptneox_token id = 0;
-            
-            {
-                auto logits = gptneox_get_logits(ctx);
-                
-                // Apply params.logit_bias map
-                for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-                    logits[it->first] += it->second;
-                }
-
-                std::vector<gptneox_token_data> candidates;
-                candidates.reserve(n_vocab);
-                for (gptneox_token token_id = 0; token_id < n_vocab; token_id++) {
-                    candidates.emplace_back(gptneox_token_data{token_id, logits[token_id], 0.0f});
-                }
-
-                gptneox_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-                // Apply penalties
-                gptneox_token nl_token = gptneox_str_to_token(ctx, "\n");
-                float nl_logit = logits[nl_token];
-                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
-                gptneox_sample_repetition_penalty(ctx, &candidates_p,
-                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                    last_n_repeat, repeat_penalty);
-                gptneox_sample_frequency_and_presence_penalties(ctx, &candidates_p,
-                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
-                    last_n_repeat, alpha_frequency, alpha_presence);
-                if (!penalize_nl) {
-                    logits[nl_token] = nl_logit;
-                }
-
-                if (temp <= 0) {
-                    // Greedy sampling
-                    id = gptneox_sample_token_greedy(ctx, &candidates_p);
-                } else {
-                    if (mirostat == 1) {
-                        static float mirostat_mu = 2.0f * mirostat_tau;
-                        const int mirostat_m = 100;
-                        gptneox_sample_temperature(ctx, &candidates_p, temp);
-                        id = gptneox_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
-                    } else if (mirostat == 2) {
-                        static float mirostat_mu = 2.0f * mirostat_tau;
-                        gptneox_sample_temperature(ctx, &candidates_p, temp);
-                        id = gptneox_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
-                    } else {
-                        // Temperature sampling
-                        gptneox_sample_top_k(ctx, &candidates_p, top_k, 1);
-                        gptneox_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
-                        gptneox_sample_typical(ctx, &candidates_p, typical_p, 1);
-                        gptneox_sample_top_p(ctx, &candidates_p, top_p, 1);
-                        gptneox_sample_temperature(ctx, &candidates_p, temp);
-                        id = gptneox_sample_token(ctx, &candidates_p);
-                    }
-                }
-            }
-            
-            // Inc out count and dec space
-            out_count += 1;
-            space -= 1;
-            // Repeat tokens update
-            last_n_tokens.push_back(id);
-            if (last_n_tokens.size() > params.repeat_last_n) {
-                last_n_tokens.erase(last_n_tokens.begin());
-            }
-            // Redpajama: check if the interactive is done. 
-            //std::cout<<" last_n_tokens.size: "<< last_n_tokens[0] <<" "<< last_n_tokens[1] <<" "<< last_n_tokens[2] << std::endl;
-            if (last_n_tokens.size()==3 && last_n_tokens[0]==gptneox_str_to_token(ctx, "<") 
-            && last_n_tokens[1]==gptneox_str_to_token(ctx, "human") && last_n_tokens[2]==gptneox_str_to_token(ctx, ">:")){
-                space = 0;
-                continue;
-            }
-
-            // Check for eos - end early - check eos before bos in case they are the same
-            if (id == gptneox_token_eos()) {
-                space = 0;
-                continue;
-            }
-            // Check for bos - skip callback if so
-            if (id == gptneox_token_bos()) {
-                continue;
-            }
-            // Convert token to string and display
-            // printf("%s(%d)", gptneox_token_to_str(ctx, id), id);
-            
-            
-            if (last_n_tokens[2]==gptneox_str_to_token(ctx, "<")){
-                ;
-            }
-            else if (last_n_tokens[2]==gptneox_str_to_token(ctx, "human")){
-                if (last_n_tokens[1]==gptneox_str_to_token(ctx, "<")){
-                    ;
-                }
-                else{
-                    printf("%s", gptneox_token_to_str(ctx, id));
-                }
-            }
-            else if (last_n_tokens[1]==gptneox_str_to_token(ctx, "<")){
-                    printf("<");
-                    printf("%s", gptneox_token_to_str(ctx, id));
-                }
-            else{
-                printf("%s", gptneox_token_to_str(ctx, id));
-            }
-            fflush(stdout);
-            // Check if we need to run another eval
-            if (space > 0) {
-                // Send generated token back into model for next generation
-                if (gptneox_eval(ctx, &id, 1, n_past, params.n_threads)) {
-                    fprintf(stderr, "%s : failed to eval\n", __func__);
-                    return 1;
-                }
-                // Increment past count
-                n_past += 1;
-            }
-            // Check for user interrupt
-            if (is_interacting) { space = 0; }
-        }
-        printf("\n"); 
-        //printf("\n %d", space);
-        fflush(stdout);
-     }
-
-    gptneox_print_timings(ctx);
-    gptneox_free(ctx);
-
-
-    return 0;
-}
diff --git a/third_party/radpajama/radpajama.mk b/third_party/radpajama/radpajama.mk
index e2fa901a4..e197d7282 100644
--- a/third_party/radpajama/radpajama.mk
+++ b/third_party/radpajama/radpajama.mk
@@ -1,76 +1,30 @@
 #-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
 #───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘
 
-PKGS += THIRD_PARTY_REDPAJAMA
+PKGS += THIRD_PARTY_RADPAJAMA
 
 ################################################################################
-# single file machine learning framework written in c
-# make -j8 o//third_party/radpajama/ggml.a
+# redpajama library code common to both executables below
 
-THIRD_PARTY_REDPAJAMA_ARTIFACTS += THIRD_PARTY_REDPAJAMA_A
-THIRD_PARTY_REDPAJAMA = $(THIRD_PARTY_REDPAJAMA_A_DEPS) $(THIRD_PARTY_REDPAJAMA_A)
-THIRD_PARTY_REDPAJAMA_A = o/$(MODE)/third_party/radpajama/ggml.a
-THIRD_PARTY_REDPAJAMA_A_HDRS = third_party/radpajama/ggml.h
-THIRD_PARTY_REDPAJAMA_A_SRCS = third_party/radpajama/ggml.c
-THIRD_PARTY_REDPAJAMA_A_OBJS = $(THIRD_PARTY_REDPAJAMA_A_SRCS:%.c=o/$(MODE)/%.o)
-THIRD_PARTY_REDPAJAMA_A_FILES = $(THIRD_PARTY_REDPAJAMA_A_SRCS) $(THIRD_PARTY_REDPAJAMA_A_HDRS)
-THIRD_PARTY_REDPAJAMA_A_CHECKS = $(THIRD_PARTY_REDPAJAMA_A).pkg $(THIRD_PARTY_REDPAJAMA_A_HDRS:%=o/$(MODE)/%.ok)
+THIRD_PARTY_RADPAJAMA_ARTIFACTS += THIRD_PARTY_RADPAJAMA_A
+THIRD_PARTY_RADPAJAMA = $(THIRD_PARTY_RADPAJAMA_A_DEPS) $(THIRD_PARTY_RADPAJAMA_A)
+THIRD_PARTY_RADPAJAMA_A = o/$(MODE)/third_party/radpajama/radpajama.a
+THIRD_PARTY_RADPAJAMA_A_OBJS = $(THIRD_PARTY_RADPAJAMA_A_SRCS:%.cc=o/$(MODE)/%.o)
+THIRD_PARTY_RADPAJAMA_A_FILES = $(THIRD_PARTY_RADPAJAMA_A_SRCS) $(THIRD_PARTY_RADPAJAMA_A_HDRS)
+THIRD_PARTY_RADPAJAMA_A_CHECKS = $(THIRD_PARTY_RADPAJAMA_A).pkg $(THIRD_PARTY_RADPAJAMA_A_HDRS:%=o/$(MODE)/%.okk)
 
-THIRD_PARTY_REDPAJAMA_A_DIRECTDEPS =						\
-	LIBC_CALLS							\
-	LIBC_INTRIN							\
-	LIBC_MEM							\
-	LIBC_NEXGEN32E							\
-	LIBC_RUNTIME							\
-	LIBC_STDIO							\
-	LIBC_THREAD							\
-	LIBC_STR							\
-	LIBC_STUBS							\
-	LIBC_SYSV							\
-	LIBC_TINYMATH							\
-	THIRD_PARTY_COMPILER_RT
+THIRD_PARTY_RADPAJAMA_A_HDRS =						\
+	third_party/radpajama/common-gptneox.h				\
+	third_party/radpajama/gptneox-util.h				\
+	third_party/radpajama/gptneox.h
 
-THIRD_PARTY_REDPAJAMA_A_DEPS :=						\
-	$(call uniq,$(foreach x,$(THIRD_PARTY_REDPAJAMA_A_DIRECTDEPS),$($(x))))
+THIRD_PARTY_RADPAJAMA_A_SRCS =						\
+	third_party/radpajama/common-gptneox.cc				\
+	third_party/radpajama/copy-gptneox.cc				\
+	third_party/radpajama/gptneox.cc				\
+	third_party/radpajama/quantize-gptneox.cc
 
-$(THIRD_PARTY_REDPAJAMA_A):							\
-		third_party/radpajama/					\
-		$(THIRD_PARTY_REDPAJAMA_A).pkg				\
-		$(THIRD_PARTY_REDPAJAMA_A_OBJS)
-
-$(THIRD_PARTY_REDPAJAMA_A).pkg:						\
-		$(THIRD_PARTY_REDPAJAMA_A_OBJS)				\
-		$(foreach x,$(THIRD_PARTY_REDPAJAMA_A_DIRECTDEPS),$($(x)_A).pkg)
-
-$(THIRD_PARTY_REDPAJAMA_A_OBJS): private					\
-		OVERRIDE_CFLAGS +=					\
-			-O3						\
-			-ffunction-sections				\
-			-fdata-sections
-
-ifeq ($(ARCH), x86_64)
-$(THIRD_PARTY_REDPAJAMA_A_OBJS): private					\
-		OVERRIDE_CFLAGS +=					\
-			-msse3						\
-			-mavx						\
-			-mavx2						\
-			-mf16c						\
-			-mfma
-endif
-
-################################################################################
-# command for running inference on large language models
-# make -j8 o//third_party/radpajama/radpajama.com
-
-THIRD_PARTY_REDPAJAMA_ARTIFACTS += THIRD_PARTY_REDPAJAMA_LLAMA
-THIRD_PARTY_REDPAJAMA_LLAMA = o/$(MODE)/third_party/radpajama/radpajama.com
-THIRD_PARTY_REDPAJAMA_LLAMA_HDRS = third_party/radpajama/llama.h third_party/radpajama/llama_util.h third_party/radpajama/common.h
-THIRD_PARTY_REDPAJAMA_LLAMA_SRCS = third_party/radpajama/radpajama.cc third_party/radpajama/common.cc
-THIRD_PARTY_REDPAJAMA_LLAMA_OBJS = $(THIRD_PARTY_REDPAJAMA_LLAMA_SRCS:%.cc=o/$(MODE)/%.o)
-THIRD_PARTY_REDPAJAMA_LLAMA_FILES := $(THIRD_PARTY_REDPAJAMA_LLAMA_SRCS) $(THIRD_PARTY_REDPAJAMA_LLAMA_HDRS)
-THIRD_PARTY_REDPAJAMA_LLAMA_CHECKS = $(THIRD_PARTY_REDPAJAMA_LLAMA).pkg $(THIRD_PARTY_REDPAJAMA_LLAMA_HDRS:%=o/$(MODE)/%.okk)
-
-THIRD_PARTY_REDPAJAMA_LLAMA_DIRECTDEPS =					\
+THIRD_PARTY_RADPAJAMA_A_DIRECTDEPS =					\
 	LIBC_CALLS							\
 	LIBC_FMT							\
 	LIBC_INTRIN							\
@@ -78,50 +32,92 @@ THIRD_PARTY_REDPAJAMA_LLAMA_DIRECTDEPS =					\
 	LIBC_NEXGEN32E							\
 	LIBC_RUNTIME							\
 	LIBC_STDIO							\
-	LIBC_LOG							\
 	LIBC_STR							\
 	LIBC_STUBS							\
 	LIBC_SYSV							\
 	LIBC_THREAD							\
 	LIBC_TINYMATH							\
-	LIBC_ZIPOS							\
-	THIRD_PARTY_REDPAJAMA						\
+	THIRD_PARTY_COMPILER_RT						\
+	THIRD_PARTY_GGML						\
 	THIRD_PARTY_LIBCXX
 
-THIRD_PARTY_REDPAJAMA_LLAMA_DEPS :=						\
-	$(call uniq,$(foreach x,$(THIRD_PARTY_REDPAJAMA_LLAMA_DIRECTDEPS),$($(x))))
+THIRD_PARTY_RADPAJAMA_A_DEPS :=						\
+	$(call uniq,$(foreach x,$(THIRD_PARTY_RADPAJAMA_A_DIRECTDEPS),$($(x))))
 
-$(THIRD_PARTY_REDPAJAMA_LLAMA).dbg:						\
-		$(THIRD_PARTY_REDPAJAMA_LLAMA).pkg				\
-		$(THIRD_PARTY_REDPAJAMA_LLAMA_DEPS)				\
-		o/$(MODE)/third_party/radpajama/radpajama.txt.zip.o	\
-		o/$(MODE)/third_party/radpajama/common.o			\
-		o/$(MODE)/third_party/radpajama/llama.o			\
-		o/$(MODE)/third_party/radpajama/radpajama.o			\
+$(THIRD_PARTY_RADPAJAMA_A):						\
+		third_party/radpajama/					\
+		$(THIRD_PARTY_RADPAJAMA_A).pkg				\
+		$(THIRD_PARTY_RADPAJAMA_A_OBJS)
+
+$(THIRD_PARTY_RADPAJAMA_A).pkg:						\
+		$(THIRD_PARTY_RADPAJAMA_A_OBJS)				\
+		$(foreach x,$(THIRD_PARTY_RADPAJAMA_A_DIRECTDEPS),$($(x)_A).pkg)
+
+################################################################################
+# two executable programs for running inference on redpajama models
+#
+#     make -j8 o//third_party/radpajama/radpajama.com
+#     make -j8 o//third_party/radpajama/radpajama-chat.com
+
+THIRD_PARTY_RADPAJAMA_ARTIFACTS += THIRD_PARTY_RADPAJAMA_MAIN
+THIRD_PARTY_RADPAJAMA_MAIN_OBJS = $(THIRD_PARTY_RADPAJAMA_MAIN_SRCS:%.cc=o/$(MODE)/%.o)
+THIRD_PARTY_RADPAJAMA_MAIN_BINS = $(THIRD_PARTY_RADPAJAMA_COMS) $(THIRD_PARTY_RADPAJAMA_COMS:%=%.dbg)
+
+THIRD_PARTY_RADPAJAMA_MAIN_COMS =					\
+	o/$(MODE)/third_party/radpajama/radpajama.com			\
+	o/$(MODE)/third_party/radpajama/radpajama-chat.com
+
+THIRD_PARTY_RADPAJAMA_MAIN_SRCS =					\
+	third_party/radpajama/main-redpajama.cc				\
+	third_party/radpajama/main-redpajama-chat.cc
+
+THIRD_PARTY_RADPAJAMA_MAIN_DIRECTDEPS =					\
+	LIBC_CALLS							\
+	LIBC_INTRIN							\
+	LIBC_NEXGEN32E							\
+	LIBC_RUNTIME							\
+	LIBC_STDIO							\
+	LIBC_STR							\
+	LIBC_STUBS							\
+	THIRD_PARTY_RADPAJAMA						\
+	THIRD_PARTY_LIBCXX
+
+THIRD_PARTY_RADPAJAMA_MAIN_DEPS :=					\
+	$(call uniq,$(foreach x,$(THIRD_PARTY_RADPAJAMA_MAIN_DIRECTDEPS),$($(x))))
+
+o/$(MODE)/third_party/radpajama/main.pkg:				\
+		$(THIRD_PARTY_RADPAJAMA_MAIN_OBJS)			\
+		$(foreach x,$(THIRD_PARTY_RADPAJAMA_MAIN_DIRECTDEPS),$($(x)_A).pkg)
+
+o/$(MODE)/third_party/radpajama/radpajama.com.dbg:			\
+		o/$(MODE)/third_party/radpajama/main.pkg		\
+		$(THIRD_PARTY_RADPAJAMA_MAIN_DEPS)			\
+		o/$(MODE)/third_party/radpajama/main-redpajama.o	\
 		$(CRT)							\
 		$(APE_NO_MODIFY_SELF)
 	@$(APELINK)
 
-$(THIRD_PARTY_REDPAJAMA_LLAMA).pkg:						\
-		$(THIRD_PARTY_REDPAJAMA_LLAMA_OBJS)				\
-		$(foreach x,$(THIRD_PARTY_REDPAJAMA_LLAMA_DIRECTDEPS),$($(x)_A).pkg)
-
-o/$(MODE)/third_party/radpajama/radpajama.txt.zip.o: private		\
-		ZIPOBJ_FLAGS +=						\
-			-B
+o/$(MODE)/third_party/radpajama/radpajama-chat.com.dbg:			\
+		o/$(MODE)/third_party/radpajama/main.pkg		\
+		$(THIRD_PARTY_RADPAJAMA_MAIN_DEPS)			\
+		o/$(MODE)/third_party/radpajama/main-redpajama-chat.o	\
+		$(CRT)							\
+		$(APE_NO_MODIFY_SELF)
+	@$(APELINK)
 
 ################################################################################
+# package level definitions
 
-THIRD_PARTY_REDPAJAMA_COMS = $(THIRD_PARTY_REDPAJAMA_LLAMA)
-THIRD_PARTY_REDPAJAMA_BINS = $(THIRD_PARTY_REDPAJAMA_COMS) $(THIRD_PARTY_REDPAJAMA_COMS:%=%.dbg)
-THIRD_PARTY_REDPAJAMA_LIBS = $(foreach x,$(THIRD_PARTY_REDPAJAMA_ARTIFACTS),$($(x)))
-THIRD_PARTY_REDPAJAMA_SRCS = $(foreach x,$(THIRD_PARTY_REDPAJAMA_ARTIFACTS),$($(x)_SRCS))
-THIRD_PARTY_REDPAJAMA_HDRS = $(foreach x,$(THIRD_PARTY_REDPAJAMA_ARTIFACTS),$($(x)_HDRS))
-THIRD_PARTY_REDPAJAMA_OBJS = $(foreach x,$(THIRD_PARTY_REDPAJAMA_ARTIFACTS),$($(x)_OBJS))
-THIRD_PARTY_REDPAJAMA_CHECKS = $(foreach x,$(THIRD_PARTY_REDPAJAMA_ARTIFACTS),$($(x)_CHECKS))
-$(THIRD_PARTY_REDPAJAMA_OBJS): third_party/radpajama/radpajama.mk
+THIRD_PARTY_RADPAJAMA_LIBS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)))
+THIRD_PARTY_RADPAJAMA_COMS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_COMS))
+THIRD_PARTY_RADPAJAMA_BINS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_BINS))
+THIRD_PARTY_RADPAJAMA_SRCS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_SRCS))
+THIRD_PARTY_RADPAJAMA_HDRS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_HDRS))
+THIRD_PARTY_RADPAJAMA_OBJS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_OBJS))
+THIRD_PARTY_RADPAJAMA_CHECKS = $(foreach x,$(THIRD_PARTY_RADPAJAMA_ARTIFACTS),$($(x)_CHECKS))
+$(THIRD_PARTY_RADPAJAMA_OBJS): third_party/radpajama/radpajama.mk
 
 .PHONY: o/$(MODE)/third_party/radpajama
-o/$(MODE)/third_party/radpajama:						\
-		$(THIRD_PARTY_REDPAJAMA_BINS)				\
-		$(THIRD_PARTY_REDPAJAMA_CHECKS)
+o/$(MODE)/third_party/radpajama:					\
+		$(THIRD_PARTY_RADPAJAMA_BINS)				\
+		$(THIRD_PARTY_RADPAJAMA_CHECKS)
diff --git a/third_party/radpajama/scripts/convert_gptneox_to_ggml.py b/third_party/radpajama/scripts/convert_gptneox_to_ggml.py
index 6a3294261..1217c05dd 100644
--- a/third_party/radpajama/scripts/convert_gptneox_to_ggml.py
+++ b/third_party/radpajama/scripts/convert_gptneox_to_ggml.py
@@ -1,3 +1,4 @@
+// clang-format off
 # Convert Hugging Face fine-tuned gpt-neox-like models to ggml format
 
 import io
diff --git a/third_party/radpajama/scripts/install-RedPajama-INCITE-Base-3B-v1.sh b/third_party/radpajama/scripts/install-RedPajama-INCITE-Base-3B-v1.sh
index 95be87af1..dca37488a 100644
--- a/third_party/radpajama/scripts/install-RedPajama-INCITE-Base-3B-v1.sh
+++ b/third_party/radpajama/scripts/install-RedPajama-INCITE-Base-3B-v1.sh
@@ -1,3 +1,4 @@
+// clang-format off
 #!/bin/bash
 
 # cd to scripts dir
diff --git a/third_party/radpajama/scripts/install-RedPajama-INCITE-Chat-3B-v1.sh b/third_party/radpajama/scripts/install-RedPajama-INCITE-Chat-3B-v1.sh
index efa8de356..9e507afd1 100644
--- a/third_party/radpajama/scripts/install-RedPajama-INCITE-Chat-3B-v1.sh
+++ b/third_party/radpajama/scripts/install-RedPajama-INCITE-Chat-3B-v1.sh
@@ -1,3 +1,4 @@
+// clang-format off
 #!/bin/bash
 
 # cd to scripts dir
diff --git a/third_party/radpajama/scripts/install-RedPajama-INCITE-Instruct-3B-v1.sh b/third_party/radpajama/scripts/install-RedPajama-INCITE-Instruct-3B-v1.sh
index 3d6d70b9c..8e0f18765 100644
--- a/third_party/radpajama/scripts/install-RedPajama-INCITE-Instruct-3B-v1.sh
+++ b/third_party/radpajama/scripts/install-RedPajama-INCITE-Instruct-3B-v1.sh
@@ -1,3 +1,4 @@
+// clang-format off
 #!/bin/bash
 
 # cd to scripts dir
diff --git a/third_party/radpajama/scripts/quantize-gptneox.py b/third_party/radpajama/scripts/quantize-gptneox.py
index 629c1b411..ba291e9b2 100644
--- a/third_party/radpajama/scripts/quantize-gptneox.py
+++ b/third_party/radpajama/scripts/quantize-gptneox.py
@@ -1,3 +1,4 @@
+// clang-format off
 #!/usr/bin/env python3
 
 """Script to execute the "quantize" script on a given set of models."""
diff --git a/third_party/third_party.mk b/third_party/third_party.mk
index b6108057d..29062ce2f 100644
--- a/third_party/third_party.mk
+++ b/third_party/third_party.mk
@@ -14,7 +14,7 @@ o/$(MODE)/third_party:				\
 	o/$(MODE)/third_party/gdtoa		\
 	o/$(MODE)/third_party/getopt		\
 	o/$(MODE)/third_party/ggml		\
-#	o/$(MODE)/third_party/radpajama		\
+	o/$(MODE)/third_party/radpajama		\
 	o/$(MODE)/third_party/hiredis		\
 	o/$(MODE)/third_party/libcxx		\
 	o/$(MODE)/third_party/linenoise		\