diff --git a/.gitignore b/.gitignore
index c75e1cb5b..2ee9fb3be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,9 +21,6 @@
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
-models/*
-*.bin
-
 /main
 /quantize
 /quantize-stats
diff --git a/BLIS.md b/BLIS.md
deleted file mode 100644
index 9b3c30605..000000000
--- a/BLIS.md
+++ /dev/null
@@ -1,67 +0,0 @@
-BLIS Installation Manual
-------------------------
-
-BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as object-based API, typed API, BLAS and CBLAS compatibility layers.
-
-Project URL: https://github.com/flame/blis
-
-### Prepare:
-
-Compile BLIS:
-
-```bash
-git clone https://github.com/flame/blis
-cd blis
-./configure --enable-cblas -t openmp,pthreads auto
-# will install to /usr/local/ by default.
-make -j
-```
-
-Install BLIS:
-
-```bash
-sudo make install
-```
-
-We recommend using openmp since it's easier to modify the cores been used.
-
-### llama.cpp compilation
-
-Makefile:
-
-```bash
-make LLAMA_BLIS=1 -j
-# make LLAMA_BLIS=1 benchmark-matmult
-```
-
-CMake:
-
-```bash
-mkdir build
-cd build
-cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
-make -j
-```
-
-### llama.cpp execution
-
-According to the BLIS documentation, we could set the following
-environment variables to modify the behavior of openmp:
-
-```
-export GOMP_GPU_AFFINITY="0-19"
-export BLIS_NUM_THREADS=14
-```
-
-And then run the binaries as normal.
-
-
-### Intel specific issue
-
-Some might get the error message saying that `libimf.so` cannot be found.
-Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
-
-### Reference:
-
-1. https://github.com/flame/blis#getting-started
-2. https://github.com/flame/blis/blob/master/docs/Multithreading.md
diff --git a/README.md b/README.md
index 231bfa5c9..c423b4898 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ A self contained distributable from Concedo that exposes llama.cpp function bind
 
 What does it mean? You get llama.cpp with a fancy UI, persistent stories, editing tools, save formats, memory, world info, author's note, characters, scenarios and everything Kobold and Kobold Lite have to offer. In a tiny package around 20 MB in size, excluding model weights.
 
-![Preview](preview.png)
+![Preview](media/preview.png)
 
 ## Usage
 - [Download the latest release here](https://github.com/LostRuins/koboldcpp/releases/latest) or clone the repo.
diff --git a/build.zig b/build.zig
deleted file mode 100644
index 306127ffe..000000000
--- a/build.zig
+++ /dev/null
@@ -1,61 +0,0 @@
-const std = @import("std");
-
-pub fn build(b: *std.build.Builder) void {
-    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardReleaseOptions();
-    const want_lto = b.option(bool, "lto", "Want -fLTO");
-
-    const lib = b.addStaticLibrary("llama", null);
-    lib.want_lto = want_lto;
-    lib.setTarget(target);
-    lib.setBuildMode(optimize);
-    lib.linkLibCpp();
-    lib.addIncludePath(".");
-    lib.addIncludePath("examples");
-    lib.addCSourceFiles(&.{
-        "ggml.c",
-    }, &.{"-std=c11"});
-    lib.addCSourceFiles(&.{
-        "llama.cpp",
-    }, &.{"-std=c++11"});
-    lib.install();
-
-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
-
-    const exe = build_example("main", build_args);
-    _ = build_example("quantize", build_args);
-    _ = build_example("perplexity", build_args);
-    _ = build_example("embedding", build_args);
-
-    // create "zig build run" command for ./main
-
-    const run_cmd = exe.run();
-    run_cmd.step.dependOn(b.getInstallStep());
-    if (b.args) |args| {
-        run_cmd.addArgs(args);
-    }
-
-    const run_step = b.step("run", "Run the app");
-    run_step.dependOn(&run_cmd.step);
-}
-
-fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
-    const b = args.b;
-    const lib = args.lib;
-    const want_lto = args.want_lto;
-
-    const exe = b.addExecutable(name, null);
-    exe.want_lto = want_lto;
-    lib.setTarget(args.target);
-    lib.setBuildMode(args.optimize);
-    exe.addIncludePath(".");
-    exe.addIncludePath("examples");
-    exe.addCSourceFiles(&.{
-        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
-        "examples/common.cpp",
-    }, &.{"-std=c++11"});
-    exe.linkLibrary(lib);
-    exe.install();
-
-    return exe;
-}
diff --git a/media/llama-leader.jpeg b/media/llama-leader.jpeg
deleted file mode 100644
index 0b4e6e1cf..000000000
Binary files a/media/llama-leader.jpeg and /dev/null differ
diff --git a/media/llama0-banner.png b/media/llama0-banner.png
deleted file mode 100644
index cee3a87f1..000000000
Binary files a/media/llama0-banner.png and /dev/null differ
diff --git a/media/llama0-logo.png b/media/llama0-logo.png
deleted file mode 100644
index e55b38bd9..000000000
Binary files a/media/llama0-logo.png and /dev/null differ
diff --git a/media/llama1-banner.png b/media/llama1-banner.png
deleted file mode 100644
index 1e469584e..000000000
Binary files a/media/llama1-banner.png and /dev/null differ
diff --git a/media/llama1-logo.png b/media/llama1-logo.png
deleted file mode 100644
index 365c5b865..000000000
Binary files a/media/llama1-logo.png and /dev/null differ
diff --git a/preview.png b/media/preview.png
similarity index 100%
rename from preview.png
rename to media/preview.png
diff --git a/models/ggml-vocab.bin b/models/ggml-vocab.bin
deleted file mode 100644
index 38f63493a..000000000
Binary files a/models/ggml-vocab.bin and /dev/null differ
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index 0458948d2..9e023a9d6 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -325,48 +325,6 @@ ModelLoadResult gpt2_v2_model_load(const std::string & fname, gpt2_v2_model & mo
 
     fin.close();
 
-// //gpu offload for gpt2
-// #if defined(GGML_USE_CLBLAST)
-// if(gpulayers>0)
-// {
-//     const auto & hparams = model.hparams;
-//     const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
-//     if(GetQuantsUnshuffled())
-//     {
-
-//         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
-
-//         size_t vram_total = 0;
-
-//         for (int i = 0; i < n_gpu; ++i) {
-//             const auto & layer = model.layers[i];
-
-//             ggml_v2_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_v2_nbytes(layer.ln_1_g);
-//             ggml_v2_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_v2_nbytes(layer.ln_1_b);
-//             ggml_v2_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_v2_nbytes(layer.ln_2_g);
-//             ggml_v2_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_v2_nbytes(layer.ln_2_b);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_v2_nbytes(layer.c_attn_attn_w);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_v2_nbytes(layer.c_attn_attn_b);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_v2_nbytes(layer.c_attn_proj_b);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_w);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_b);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_b);
-//         }
-
-//         fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-//     }
-//     else
-//     {
-//         if(n_gpu>0)
-//         {
-//             printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
-//         }
-//     }
-// }
-// #endif
-
     return ModelLoadResult::SUCCESS;
 }
 
diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp
index 9cf78d9fe..b3d9a5666 100644
--- a/otherarch/gpt2_v3.cpp
+++ b/otherarch/gpt2_v3.cpp
@@ -337,50 +337,6 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
     fin.close();
 
-// //gpu offload for gpt2
-// #if defined(GGML_USE_CLBLAST)
-// if(gpulayers>0)
-// {
-//     const auto & hparams = model.hparams;
-//     const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
-//     if(GetQuantsUnshuffled())
-//     {
-//         SetGPULayers(n_gpu);
-
-//         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
-
-//         size_t vram_total = 0;
-
-//         for (int i = 0; i < n_gpu; ++i) {
-//             const auto & layer = model.layers[i];
-
-//             ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
-//             ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
-//             ggml_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_nbytes(layer.ln_2_g);
-//             ggml_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_nbytes(layer.ln_2_b);
-//             ggml_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
-//             ggml_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_nbytes(layer.c_attn_attn_b);
-//             ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
-//             ggml_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_nbytes(layer.c_attn_proj_b);
-//             ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
-//             ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
-//             ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
-//             ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
-//         }
-
-//         fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-//     }
-//     else
-//     {
-//         if(n_gpu>0)
-//         {
-//             printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
-//         }
-//     }
-// }
-// #endif
-
-
     return ModelLoadResult::SUCCESS;
 }
 
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index cfb48a8c6..100ca1ace 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -331,46 +331,6 @@ ModelLoadResult gptj_v2_model_load(const std::string & fname, gptj_v2_model & mo
 
     fin.close();
 
-// //gpu offload for gptj
-// #if defined(GGML_USE_CLBLAST)
-// if(gpulayers>0)
-// {
-//     const auto & hparams = model.hparams;
-//     const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
-//     if(GetQuantsUnshuffled())
-//     {
-
-//         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
-
-//         size_t vram_total = 0;
-
-//         for (int i = 0; i < n_gpu; ++i) {
-//             const auto & layer = model.layers[i];
-
-//             ggml_v2_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_v2_nbytes(layer.ln_1_g);
-//             ggml_v2_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_v2_nbytes(layer.ln_1_b);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_q_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_k_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_v_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_v2_nbytes(layer.c_attn_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_w);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_v2_nbytes(layer.c_mlp_fc_b);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_w);
-//             ggml_v2_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_v2_nbytes(layer.c_mlp_proj_b);
-//         }
-
-//         fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-//     }
-//     else
-//     {
-//         if(n_gpu>0)
-//         {
-//             printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
-//         }
-//     }
-// }
-// #endif
-
     return ModelLoadResult::SUCCESS;
 }
 
diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp
index 893e6ebd6..894e7fb4d 100644
--- a/otherarch/gptj_v3.cpp
+++ b/otherarch/gptj_v3.cpp
@@ -331,46 +331,6 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 
     fin.close();
 
-// //gpu offload for gptj
-// #if defined(GGML_USE_CLBLAST)
-// if(gpulayers>0)
-// {
-//     const auto & hparams = model.hparams;
-//     const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
-//     if(GetQuantsUnshuffled())
-//     {
-//         SetGPULayers(n_gpu);
-
-//         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
-
-//         size_t vram_total = 0;
-
-//         for (int i = 0; i < n_gpu; ++i) {
-//             const auto & layer = model.layers[i];
-
-//             ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
-//             ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
-//             ggml_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
-//             ggml_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
-//             ggml_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
-//             ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
-//             ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
-//             ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
-//             ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
-//             ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
-//         }
-
-//         fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
-//     }
-//     else
-//     {
-//         if(n_gpu>0)
-//         {
-//             printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
-//         }
-//     }
-// }
-// #endif
 
     return ModelLoadResult::SUCCESS;
 
diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt
deleted file mode 100644
index 03e1d2c04..000000000
--- a/pocs/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-# dependencies
-
-find_package(Threads REQUIRED)
-
-# third-party
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-if (EMSCRIPTEN)
-else()
-    add_subdirectory(vdot)
-endif()
diff --git a/pocs/vdot/CMakeLists.txt b/pocs/vdot/CMakeLists.txt
deleted file mode 100644
index fb89a1cd4..000000000
--- a/pocs/vdot/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-set(TARGET vdot)
-add_executable(${TARGET} vdot.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
-
-set(TARGET q8dot)
-add_executable(${TARGET} q8dot.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/pocs/vdot/q8dot.cpp b/pocs/vdot/q8dot.cpp
deleted file mode 100644
index 5748c8ac2..000000000
--- a/pocs/vdot/q8dot.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <cstdint>
-#include <cmath>
-#include <cassert>
-#include <vector>
-#include <random>
-#include <chrono>
-#include <algorithm>
-#include <array>
-
-#include <ggml.h>
-
-constexpr int kVecSize = 1 << 16;
-
-// Copy-pasted from ggml.c
-#define QK4_0 32
-typedef struct {
-    float d;                // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-typedef struct {
-    float d;                // delta
-    float m;                // min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-// Copy-pasted from ggml.c
-#define QK8_0 32
-typedef struct {
-    float d;           // delta
-    float s;           // d * sum(qs[i])
-    int8_t qs[QK8_0];  // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == 2*sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
-
-static_assert(QK4_1 == QK8_0, "QK4_1 and QK8_0 must be the same");
-static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same");
-
-template <typename T>
-void fillQ4blocks(std::vector<T>& blocks, std::mt19937& rndm) {
-    for (auto& b : blocks) {
-        b.d = 1;
-        for (int i=0; i<QK4_0/2; ++i) {
-            uint8_t v1 = rndm() >> 28;
-            uint8_t v2 = rndm() >> 28;
-            b.qs[i] = v1 | (v2 << 4);
-        }
-    }
-}
-
-void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) {
-    for (auto& b : blocks) {
-        b.d = 1;
-        int sum = 0;
-        for (int i=0; i<QK8_0; ++i) {
-            b.qs[i] = (rndm() >> 24) - 128;
-            sum += b.qs[i];
-        }
-        b.s = b.d * sum;
-    }
-}
-
-float simpleDot(const block_q4_0& x, const block_q8_0& y) {
-    int s1 = 0; //, s2 = 0;
-    for (int i=0; i<QK4_0/2; i+=2) {
-        int v1 = x.qs[i+0] & 0xf;
-        int v2 = x.qs[i+0] >> 4;
-        int v3 = x.qs[i+1] & 0xf;
-        int v4 = x.qs[i+1] >> 4;
-        int j = 2*i;
-        s1 += v1*y.qs[j] + v2*y.qs[j+1] + v3*y.qs[j+2] + v4*y.qs[j+3];
-        //s2 += y.qs[j] + y.qs[j+1] + y.qs[j+2] + y.qs[j+3];
-    }
-    return y.d * x.d * s1 - 8 * x.d * y.s;
-    //return y.d * x.d * (s1 - 8 * s2);
-}
-
-float simpleDot(const block_q4_1& x, const block_q8_0& y) {
-    int s1 = 0; //, s2 = 0;
-    for (int i=0; i<QK4_1/2; i+=2) {
-        int v1 = x.qs[i+0] & 0xf;
-        int v2 = x.qs[i+0] >> 4;
-        int v3 = x.qs[i+1] & 0xf;
-        int v4 = x.qs[i+1] >> 4;
-        int j = 2*i;
-        s1 += v1*y.qs[j] + v2*y.qs[j+1] + v3*y.qs[j+2] + v4*y.qs[j+3];
-        //s2 += y.qs[j] + y.qs[j+1] + y.qs[j+2] + y.qs[j+3];
-    }
-    return y.d * x.d * s1 + y.s * x.m;
-    //return y.d * (x.d * s1 + x.m * s2);
-}
-
-struct Stat {
-    double sum = 0, sumt = 0, sumt2 = 0, maxt = 0;
-    int nloop = 0;
-    void addResult(double s, double t) {
-        sum += s;
-        sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
-        ++nloop;
-    }
-    void reportResult(const char* title) const {
-        if (nloop < 1) {
-            printf("%s(%s): no result\n",__func__,title);
-            return;
-        }
-        printf("============ %s\n",title);
-        printf(" = %g\n",sum/nloop);
-        auto t = sumt/nloop, dt = sumt2/nloop - t*t;
-        if (dt > 0) dt = sqrt(dt);
-        printf("