From ba15dfd0be8d08390fe29c88a8e82b1089af3a4c Mon Sep 17 00:00:00 2001 From: niansa Date: Thu, 22 Jun 2023 12:58:07 +0200 Subject: [PATCH 01/93] Nomic vulkan backend licensed under the Software for Open Models License (SOM), version 1.0. --- .gitmodules | 0 CMakeLists.txt | 123 ++ LICENSE_SOM.txt | 30 + examples/main/main.cpp | 8 + ggml-vulkan.cpp | 1313 +++++++++++++++++ ggml-vulkan.h | 61 + ggml.c | 32 +- kompute/.ccls | 27 + kompute/.clang-format | 5 + kompute/.dockerignore | 4 + kompute/.github/workflows/cpp_examples.yml | 58 + kompute/.github/workflows/cpp_tests.yml | 104 ++ kompute/.github/workflows/python_tests.yml | 28 + kompute/CMakeLists.txt | 187 +++ kompute/LICENSE | 203 +++ kompute/Makefile | 210 +++ kompute/README.md | 513 +++++++ kompute/cmake/bin2h.cmake | 106 ++ kompute/cmake/bin_file_to_header.cmake | 19 + kompute/cmake/check_vulkan_version.cmake | 139 ++ kompute/cmake/code_coverage.cmake | 35 + kompute/cmake/deprecation_warnings.cmake | 15 + kompute/cmake/komputeConfig.cmake.in | 8 + kompute/cmake/vulkan_shader_compiler.cmake | 43 + kompute/config/FindSphinx.cmake | 16 + kompute/external/bin/xxd.c | 819 ++++++++++ kompute/kompute-config.cmake | 28 + kompute/op_add.comp | 145 ++ kompute/op_addrow.comp | 145 ++ kompute/op_cpy_f16_f16.comp | 176 +++ kompute/op_cpy_f16_f32.comp | 176 +++ kompute/op_cpy_f32_f16.comp | 176 +++ kompute/op_cpy_f32_f32.comp | 168 +++ kompute/op_diagmask.comp | 153 ++ kompute/op_gelu.comp | 142 ++ kompute/op_getrows_f16.comp | 150 ++ kompute/op_getrows_q4_0.comp | 179 +++ kompute/op_getrows_q4_1.comp | 181 +++ kompute/op_mul.comp | 145 ++ kompute/op_mul_mat_f16.comp | 177 +++ kompute/op_mul_mat_q4_0.comp | 195 +++ kompute/op_mul_mat_q4_1.comp | 218 +++ kompute/op_mulrow.comp | 145 ++ kompute/op_norm.comp | 209 +++ kompute/op_relu.comp | 141 ++ kompute/op_rmsnorm.comp | 178 +++ kompute/op_rope.comp | 183 +++ kompute/op_scale.comp | 142 ++ kompute/op_silu.comp | 141 ++ kompute/op_softmax.comp | 197 +++ kompute/scripts/convert_shaders.py | 148 ++ kompute/scripts/requirements.txt | 11 + kompute/setup.py | 93 ++ kompute/src/Algorithm.cpp | 450 ++++++ kompute/src/CMakeLists.txt | 82 + kompute/src/Core.cpp | 27 + kompute/src/Manager.cpp | 493 +++++++ kompute/src/OpAlgoDispatch.cpp | 65 + kompute/src/OpBufferSyncDevice.cpp | 51 + kompute/src/OpBufferSyncLocal.cpp | 51 + kompute/src/OpMemoryBarrier.cpp | 74 + kompute/src/OpTensorCopy.cpp | 90 ++ kompute/src/OpTensorSyncDevice.cpp | 61 + kompute/src/OpTensorSyncLocal.cpp | 76 + kompute/src/Sequence.cpp | 396 +++++ kompute/src/Tensor.cpp | 451 ++++++ kompute/src/include/CMakeLists.txt | 46 + kompute/src/include/kompute/Algorithm.hpp | 338 +++++ kompute/src/include/kompute/Core.hpp | 39 + kompute/src/include/kompute/Kompute.hpp | 21 + kompute/src/include/kompute/Manager.hpp | 267 ++++ kompute/src/include/kompute/Sequence.hpp | 313 ++++ kompute/src/include/kompute/Tensor.hpp | 306 ++++ kompute/src/include/kompute/logger/Logger.hpp | 197 +++ .../kompute/operations/OpAlgoDispatch.hpp | 86 ++ .../src/include/kompute/operations/OpBase.hpp | 62 + .../kompute/operations/OpBufferSyncDevice.hpp | 50 + .../kompute/operations/OpBufferSyncLocal.hpp | 50 + .../kompute/operations/OpMemoryBarrier.hpp | 81 + .../src/include/kompute/operations/OpMult.hpp | 58 + .../kompute/operations/OpTensorCopy.hpp | 63 + .../kompute/operations/OpTensorSyncDevice.hpp | 66 + .../kompute/operations/OpTensorSyncLocal.hpp | 66 + kompute/src/logger/CMakeLists.txt | 69 + kompute/src/logger/Logger.cpp | 101 ++ kompute/src/shaders/CMakeLists.txt | 
5 + kompute/src/shaders/glsl/CMakeLists.txt | 26 + .../glsl/ShaderLogisticRegression.comp | 52 + .../glsl/ShaderLogisticRegression.hpp.in | 310 ++++ kompute/src/shaders/glsl/ShaderOpMult.comp | 28 + kompute/src/shaders/glsl/ShaderOpMult.hpp.in | 101 ++ kompute/src/shaders/hlsl/computeheadless.comp | 29 + llama.cpp | 47 +- llama.h | 2 +- undump.py | 18 + 95 files changed, 13489 insertions(+), 23 deletions(-) create mode 100644 .gitmodules create mode 100644 LICENSE_SOM.txt create mode 100644 ggml-vulkan.cpp create mode 100644 ggml-vulkan.h create mode 100644 kompute/.ccls create mode 100644 kompute/.clang-format create mode 100644 kompute/.dockerignore create mode 100644 kompute/.github/workflows/cpp_examples.yml create mode 100644 kompute/.github/workflows/cpp_tests.yml create mode 100644 kompute/.github/workflows/python_tests.yml create mode 100644 kompute/CMakeLists.txt create mode 100644 kompute/LICENSE create mode 100644 kompute/Makefile create mode 100644 kompute/README.md create mode 100644 kompute/cmake/bin2h.cmake create mode 100644 kompute/cmake/bin_file_to_header.cmake create mode 100644 kompute/cmake/check_vulkan_version.cmake create mode 100644 kompute/cmake/code_coverage.cmake create mode 100644 kompute/cmake/deprecation_warnings.cmake create mode 100644 kompute/cmake/komputeConfig.cmake.in create mode 100644 kompute/cmake/vulkan_shader_compiler.cmake create mode 100644 kompute/config/FindSphinx.cmake create mode 100644 kompute/external/bin/xxd.c create mode 100644 kompute/kompute-config.cmake create mode 100644 kompute/op_add.comp create mode 100644 kompute/op_addrow.comp create mode 100644 kompute/op_cpy_f16_f16.comp create mode 100644 kompute/op_cpy_f16_f32.comp create mode 100644 kompute/op_cpy_f32_f16.comp create mode 100644 kompute/op_cpy_f32_f32.comp create mode 100644 kompute/op_diagmask.comp create mode 100644 kompute/op_gelu.comp create mode 100644 kompute/op_getrows_f16.comp create mode 100644 kompute/op_getrows_q4_0.comp create mode 100644 kompute/op_getrows_q4_1.comp create mode 100644 kompute/op_mul.comp create mode 100644 kompute/op_mul_mat_f16.comp create mode 100644 kompute/op_mul_mat_q4_0.comp create mode 100644 kompute/op_mul_mat_q4_1.comp create mode 100644 kompute/op_mulrow.comp create mode 100644 kompute/op_norm.comp create mode 100644 kompute/op_relu.comp create mode 100644 kompute/op_rmsnorm.comp create mode 100644 kompute/op_rope.comp create mode 100644 kompute/op_scale.comp create mode 100644 kompute/op_silu.comp create mode 100644 kompute/op_softmax.comp create mode 100644 kompute/scripts/convert_shaders.py create mode 100644 kompute/scripts/requirements.txt create mode 100644 kompute/setup.py create mode 100644 kompute/src/Algorithm.cpp create mode 100644 kompute/src/CMakeLists.txt create mode 100644 kompute/src/Core.cpp create mode 100644 kompute/src/Manager.cpp create mode 100644 kompute/src/OpAlgoDispatch.cpp create mode 100644 kompute/src/OpBufferSyncDevice.cpp create mode 100644 kompute/src/OpBufferSyncLocal.cpp create mode 100644 kompute/src/OpMemoryBarrier.cpp create mode 100644 kompute/src/OpTensorCopy.cpp create mode 100644 kompute/src/OpTensorSyncDevice.cpp create mode 100644 kompute/src/OpTensorSyncLocal.cpp create mode 100644 kompute/src/Sequence.cpp create mode 100644 kompute/src/Tensor.cpp create mode 100644 kompute/src/include/CMakeLists.txt create mode 100644 kompute/src/include/kompute/Algorithm.hpp create mode 100644 kompute/src/include/kompute/Core.hpp create mode 100644 kompute/src/include/kompute/Kompute.hpp create mode 100644 
kompute/src/include/kompute/Manager.hpp create mode 100644 kompute/src/include/kompute/Sequence.hpp create mode 100644 kompute/src/include/kompute/Tensor.hpp create mode 100644 kompute/src/include/kompute/logger/Logger.hpp create mode 100644 kompute/src/include/kompute/operations/OpAlgoDispatch.hpp create mode 100644 kompute/src/include/kompute/operations/OpBase.hpp create mode 100644 kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp create mode 100644 kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp create mode 100644 kompute/src/include/kompute/operations/OpMemoryBarrier.hpp create mode 100644 kompute/src/include/kompute/operations/OpMult.hpp create mode 100644 kompute/src/include/kompute/operations/OpTensorCopy.hpp create mode 100644 kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp create mode 100644 kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp create mode 100644 kompute/src/logger/CMakeLists.txt create mode 100644 kompute/src/logger/Logger.cpp create mode 100644 kompute/src/shaders/CMakeLists.txt create mode 100644 kompute/src/shaders/glsl/CMakeLists.txt create mode 100644 kompute/src/shaders/glsl/ShaderLogisticRegression.comp create mode 100644 kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in create mode 100644 kompute/src/shaders/glsl/ShaderOpMult.comp create mode 100644 kompute/src/shaders/glsl/ShaderOpMult.hpp.in create mode 100644 kompute/src/shaders/hlsl/computeheadless.comp create mode 100644 undump.py diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..e69de29bb diff --git a/CMakeLists.txt b/CMakeLists.txt index c4a649a97..88585fb93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,6 +86,7 @@ option(LLAMA_HIPBLAS "llama: use hipBLAS" option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT}) option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) +option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) @@ -412,6 +413,127 @@ if (LLAMA_HIPBLAS) endif() endif() +if (LLAMA_KOMPUTE) + find_package(Vulkan COMPONENTS glslc REQUIRED) + find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) + if (NOT glslc_executable) + message(FATAL_ERROR "glslc not found") + endif() + + function(compile_shader) + set(options) + set(oneValueArgs) + set(multiValueArgs SOURCES) + cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + foreach(source ${compile_shader_SOURCES}) + set(spv_file ${source}.spv) + add_custom_command( + OUTPUT ${spv_file} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} + COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} + COMMENT "Compiling ${source} to ${source}.spv" + ) + + get_filename_component(RAW_FILE_NAME ${spv_file} NAME) + set(FILE_NAME "shader${RAW_FILE_NAME}") + string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) + string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) + string(REPLACE "." 
"_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") + set(OUTPUT_HEADER_FILE "${HEADER_FILE}") + message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} + COMMENT "Converting to hpp: ${FILE_NAME}" + ) + endforeach() + endfunction() + + if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") + message(STATUS "Kompute found") + add_subdirectory(kompute) + + # Compile our shaders + compile_shader(SOURCES + kompute/op_scale.comp + kompute/op_add.comp + kompute/op_addrow.comp + kompute/op_mul.comp + kompute/op_mulrow.comp + kompute/op_silu.comp + kompute/op_relu.comp + kompute/op_gelu.comp + kompute/op_softmax.comp + kompute/op_norm.comp + kompute/op_rmsnorm.comp + kompute/op_diagmask.comp + kompute/op_mul_mat_f16.comp + kompute/op_mul_mat_q4_0.comp + kompute/op_mul_mat_q4_1.comp + kompute/op_getrows_f16.comp + kompute/op_getrows_q4_0.comp + kompute/op_getrows_q4_1.comp + kompute/op_rope.comp + kompute/op_cpy_f16_f16.comp + kompute/op_cpy_f16_f32.comp + kompute/op_cpy_f32_f16.comp + kompute/op_cpy_f32_f32.comp + ) + + # Create a custom target for our generated shaders + add_custom_target(generated_shaders DEPENDS + shaderop_scale.h + shaderop_add.h + shaderop_addrow.h + shaderop_mul.h + shaderop_mulrow.h + shaderop_silu.h + shaderop_relu.h + shaderop_gelu.h + shaderop_softmax.h + shaderop_norm.h + shaderop_rmsnorm.h + shaderop_diagmask.h + shaderop_mul_mat_f16.h + shaderop_mul_mat_q4_0.h + shaderop_mul_mat_q4_1.h + shaderop_getrows_f16.h + shaderop_getrows_q4_0.h + shaderop_getrows_q4_1.h + shaderop_rope.h + shaderop_cpy_f16_f16.h + shaderop_cpy_f16_f32.h + shaderop_cpy_f32_f16.h + shaderop_cpy_f32_f32.h + ) + + # Create a custom command that depends on the generated_shaders + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp + DEPENDS generated_shaders + COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp" + ) + + # Add the stamp to the main sources to ensure dependency tracking + set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp) + set(GGML_HEADERS_KOMPUTE ggml-vulkan.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp) + add_compile_definitions(GGML_USE_KOMPUTE) + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute) + set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR}) + else() + message(WARNING "Kompute not found") + endif() +endif() + if (LLAMA_ALL_WARNINGS) if (NOT MSVC) set(c_flags @@ -648,6 +770,7 @@ add_library(ggml OBJECT ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} ) target_include_directories(ggml PUBLIC . 
${LLAMA_EXTRA_INCLUDES}) diff --git a/LICENSE_SOM.txt b/LICENSE_SOM.txt new file mode 100644 index 000000000..eb912c0fd --- /dev/null +++ b/LICENSE_SOM.txt @@ -0,0 +1,30 @@ +Software for Open Models License (SOM) +Version 1.0 dated August 30th, 2023 + +This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software. + +This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent. + +1. Definitions +The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license. +A “Model” is the output of a machine learning algorithm, and excludes the Software. +“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model. +“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law. + +2. Grant of Rights. Subject to the conditions and limitations in section 3: +(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software. + +(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software. No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor. + +3. Conditions and Limitations +(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms. + +(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms. + +(C) No Trademark License. This license does not grant you rights to use the Licensor’s name, logo, or trademarks. + +(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim. + +(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license. 
+ +(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index d78112260..16f8fc72b 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -33,6 +33,10 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#if defined(GGML_USE_KOMPUTE) +#include "ggml-vulkan.h" +#endif + static llama_context ** g_ctx; static llama_model ** g_model; static gpt_params * g_params; @@ -171,6 +175,10 @@ int main(int argc, char ** argv) { g_model = &model; g_ctx = &ctx; +#if defined(GGML_USE_KOMPUTE) + ggml_vk_init_device(0, "gpu"); +#endif + // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp new file mode 100644 index 000000000..32590d03e --- /dev/null +++ b/ggml-vulkan.cpp @@ -0,0 +1,1313 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#include "ggml-vulkan.h" +#include "ggml.h" + +// These are generated at build time by cmake custom command +#include "shaderop_scale.h" +#include "shaderop_add.h" +#include "shaderop_addrow.h" +#include "shaderop_mul.h" +#include "shaderop_mulrow.h" +#include "shaderop_silu.h" +#include "shaderop_relu.h" +#include "shaderop_gelu.h" +#include "shaderop_softmax.h" +#include "shaderop_norm.h" +#include "shaderop_rmsnorm.h" +#include "shaderop_diagmask.h" +#include "shaderop_mul_mat_f16.h" +#include "shaderop_mul_mat_q4_0.h" +#include "shaderop_mul_mat_q4_1.h" +#include "shaderop_getrows_f16.h" +#include "shaderop_getrows_q4_0.h" +#include "shaderop_getrows_q4_1.h" +#include "shaderop_rope.h" +#include "shaderop_cpy_f16_f16.h" +#include "shaderop_cpy_f16_f32.h" +#include "shaderop_cpy_f32_f16.h" +#include "shaderop_cpy_f32_f32.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __STDC_IEC_559__ +#warning Your C implementation does not seem to be IEC 559 compliant, which is required for proper Vulkan interop. 
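+// Note: __STDC_IEC_559__ signals IEEE 754 (IEC 559) floating-point semantics, which this
+// backend assumes when copying float tensor data directly between host memory and Vulkan
+// device buffers.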
+#endif
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+typedef ggml_fp16_t half;
+
+struct ggml_kompute_context {
+    bool hasH2DAll = false;
+    std::vector<ggml_vk_memory> buffers;
+    std::shared_ptr<vk::DescriptorPool> pool;
+    static ggml_kompute_context *instance;
+    ggml_kompute_context() {
+        instance = this;
+    }
+};
+
+ggml_kompute_context *ggml_kompute_context::instance;
+
+kp::Manager mgr;
+
+#ifdef __linux__
+__attribute__((constructor))
+static void enable_sam() {
+    setenv("RADV_PERFTEST", "sam", false);
+}
+#endif
+
+static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physicalDevice) {
+    vk::PhysicalDeviceFeatures availableFeatures;
+    physicalDevice.getFeatures(&availableFeatures);
+
+    if (!availableFeatures.shaderInt16)
+        return false;
+
+    vk::PhysicalDeviceVulkan11Features availableFeatures11;
+    vk::PhysicalDeviceVulkan12Features availableFeatures12;
+
+    availableFeatures11.pNext = &availableFeatures12;
+    availableFeatures12.pNext = nullptr;
+
+    vk::PhysicalDeviceFeatures2 features2;
+    features2.pNext = &availableFeatures11;
+
+    physicalDevice.getFeatures2(&features2);
+
+    if (!availableFeatures11.uniformAndStorageBuffer16BitAccess ||
+        !availableFeatures11.storageBuffer16BitAccess) {
+        return false;
+    }
+
+    if (!availableFeatures12.storageBuffer8BitAccess ||
+        !availableFeatures12.uniformAndStorageBuffer8BitAccess ||
+        !availableFeatures12.shaderFloat16 ||
+        !availableFeatures12.shaderInt8) {
+        return false;
+    }
+
+    return true;
+}
+
+static std::string ggml_vk_getVendorName(uint32_t vendorID) {
+    switch (vendorID) {
+        case 0x10DE:
+            return "nvidia";
+        case 0x1002:
+            return "amd";
+        case 0x8086:
+            return "intel";
+        default:
+            return "unknown";
+    }
+}
+
+std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired) {
+    std::vector<vk::PhysicalDevice> physicalDevices = mgr.listDevices();
+    uint32_t deviceCount = physicalDevices.size();
+
+    std::vector<ggml_vk_device> results;
+
+    if (deviceCount == 0)
+        return results;
+
+    for (uint32_t i = 0; i < deviceCount; i++) {
+        VkPhysicalDeviceProperties properties;
+        vkGetPhysicalDeviceProperties(physicalDevices.at(i), &properties);
+
+        VkPhysicalDeviceMemoryProperties memoryProperties;
+        vkGetPhysicalDeviceMemoryProperties(physicalDevices.at(i), &memoryProperties);
+
+        const uint32_t major = VK_VERSION_MAJOR(properties.apiVersion);
+        const uint32_t minor = VK_VERSION_MINOR(properties.apiVersion);
+        if (major < 1 || minor < 2)
+            continue;
+
+        if (!ggml_vk_checkPhysicalDeviceFeatures(physicalDevices.at(i)))
+            continue;
+
+        size_t heapSize = 0;
+        for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) {
+            VkMemoryHeap heap = memoryProperties.memoryHeaps[j];
+            if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
+                heapSize = heap.size;
+                break;
+            }
+        }
+
+        if (heapSize < memoryRequired)
+            continue;
+
+        ggml_vk_device d;
+        d.index = i;
+        d.type = properties.deviceType;
+        d.heapSize = heapSize;
+        d.name = properties.deviceName;
+        d.vendor = ggml_vk_getVendorName(properties.vendorID);
+        results.push_back(d);
+    }
+
+    std::stable_sort(results.begin(), results.end(),
+        [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool {
+            if (lhs.type != rhs.type) {
+                if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true;
+                if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false;
+
+                if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true;
+                if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false;
+            }
+            return lhs.heapSize < rhs.heapSize;
+        }
+    );
+
+    return results;
+}
+
+static void ggml_vk_filterByVendor(std::vector<ggml_vk_device>& devices, const
std::string& targetVendor) { + devices.erase( + std::remove_if(devices.begin(), devices.end(), + [&targetVendor](const ggml_vk_device& device) { + return device.vendor != targetVendor; + }), + devices.end() + ); +} + +static void ggml_vk_filterByName(std::vector& devices, const std::string& targetName) { + devices.erase( + std::remove_if(devices.begin(), devices.end(), + [&targetName](const ggml_vk_device& device) { + return device.name != targetName; + }), + devices.end() + ); +} + +bool ggml_vk_init_device(size_t memoryRequired, const std::string &device) { + if (device.empty()) + return false; + + std::vector devices = ggml_vk_available_devices(memoryRequired); + if (device == "gpu") { + if (devices.size() != 0) + return ggml_vk_init_device(devices.front()); + } else if (device == "amd" || device == "nvidia" || device == "intel") { + ggml_vk_filterByVendor(devices, device); + if (devices.size() != 0) + return ggml_vk_init_device(devices.front()); + } else { + ggml_vk_filterByName(devices, device); + if (devices.size() != 0) + return ggml_vk_init_device(devices.front()); + } + + return ggml_vk_has_device(); +} + +bool ggml_vk_init_device(const ggml_vk_device &device) { + return ggml_vk_init_device(device.index); +} + +bool ggml_vk_init_device(int device) { + mgr.initializeDevice(device, {}, + {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage", + "VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"}); + return ggml_vk_has_device(); +} + +bool ggml_vk_has_device() { + return mgr.hasDevice(); +} + +ggml_vk_device ggml_vk_current_device() { + if (!mgr.hasDevice()) + return ggml_vk_device(); + + std::vector devices = ggml_vk_available_devices(0); + ggml_vk_filterByName(devices, mgr.physicalDevice()->getProperties().deviceName); + return devices.front(); +} + +ggml_kompute_context *ggml_vk_init() { + return new ggml_kompute_context; +} + +bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) { + return ctx->hasH2DAll; +} + +void ggml_vk_free(struct ggml_kompute_context * ctx) { + delete ctx; +} + +static +void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) { + std::vector descriptorPoolSizes = { + vk::DescriptorPoolSize( + vk::DescriptorType::eStorageBuffer, + 3 * size // Descriptor count is number of possible tensors to pass into an algorithm + ) + }; + + vk::DescriptorPoolCreateInfo descriptorPoolInfo( + vk::DescriptorPoolCreateFlags(), + size, // Max sets + static_cast(descriptorPoolSizes.size()), + descriptorPoolSizes.data()); + + ctx->pool = std::make_shared(); + vk::Result r = mgr.device()->createDescriptorPool( + &descriptorPoolInfo, nullptr, ctx->pool.get()); + if (r != vk::Result::eSuccess) + std::cerr << "Error allocating descriptor pool" << vk::to_string(r); +} + +static +void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) { + if (ctx->pool) { + mgr.device()->destroy( + *ctx->pool, + (vk::Optional)nullptr); + ctx->pool = nullptr; + } +} + +static +vk::Buffer *ggml_vk_allocate_buffer(size_t size) { + vk::BufferCreateInfo bufferCreateInfo; + bufferCreateInfo.size = size; + bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer | + vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eTransferDst; + bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive; + + vk::Buffer *vkBuffer = new vk::Buffer; + vk::Result r = mgr.device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer); + if (r != vk::Result::eSuccess) + std::cerr << "Error allocating buffer" << vk::to_string(r); + return vkBuffer; 
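+    // Note: the vk::Buffer created here has no memory bound to it yet. The ggml_vk_allocate()
+    // helpers below select a compatible memory type from getBufferMemoryRequirements(),
+    // bind device-local memory for the primary buffer, and add a mapped host-visible
+    // staging buffer only when the device-local type itself is not host-visible.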
+} + +static +vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) { + + uint32_t memoryTypeIndex = -1; + bool memoryTypeIndexFound = false; + vk::PhysicalDeviceMemoryProperties memoryProperties = mgr.physicalDevice()->getMemoryProperties(); + for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) { + if (requirements.memoryTypeBits & (1 << i)) { + if (((memoryProperties.memoryTypes[i]).propertyFlags & + flags) == flags) { + memoryTypeIndex = i; + memoryTypeIndexFound = true; + if (isHostVisible && (memoryProperties.memoryTypes[i].propertyFlags & vk::MemoryPropertyFlagBits::eHostVisible)) { + *isHostVisible = true; + } + break; + } + } + } + if (!memoryTypeIndexFound) { + throw std::runtime_error( + "Memory type index for buffer creation not found"); + } + + vk::MemoryAllocateInfo allocInfo; + allocInfo.allocationSize = size; + allocInfo.memoryTypeIndex = memoryTypeIndex; + vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory; + vk::Result r = mgr.device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory); + if (r != vk::Result::eSuccess) + std::cerr << "Error allocating memory" << vk::to_string(r); + return vkDeviceMemory; +} + +size_t ggml_vk_aligned_offset(size_t offset) { + + static size_t minStorageBufferOffsetAlignment = 0; + if (minStorageBufferOffsetAlignment == 0) { + vk::PhysicalDeviceProperties deviceProperties; + deviceProperties = mgr.physicalDevice()->getProperties(); + vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits; + minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment; + } + + // If offset is already aligned, return it directly + if (offset % minStorageBufferOffsetAlignment == 0) { + return offset; + } + + // Otherwise, return the largest multiple of minStorageBufferOffsetAlignment less than offset + return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment; +} + +static void ggml_vk_h2d_buffer(const ggml_vk_memory &memory) { + if (memory.stagingBuffer) + mgr.sequence()->eval(memory.primaryBuffer, memory.stagingBuffer, memory.size); +} + +static void ggml_vk_d2h_buffer(const ggml_vk_memory &memory) { + if (memory.stagingBuffer) + mgr.sequence()->eval(memory.primaryBuffer, memory.stagingBuffer, memory.size); +} + +ggml_vk_memory ggml_vk_allocate(size_t size) { + ggml_vk_memory memory; + bool isHostVisible = false; + { + memory.primaryBuffer = ggml_vk_allocate_buffer(size); + vk::MemoryRequirements memoryRequirements = mgr.device()->getBufferMemoryRequirements(*memory.primaryBuffer); + vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal; + memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible); + mgr.device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0); + if (isHostVisible) { + vk::Result r = mgr.device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data); + if (r != vk::Result::eSuccess) + std::cerr << "Error mapping memory" << vk::to_string(r); + } + } + + if (!isHostVisible) { + memory.stagingBuffer = ggml_vk_allocate_buffer(size); + vk::MemoryRequirements memoryRequirements = mgr.device()->getBufferMemoryRequirements(*memory.stagingBuffer); + vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible | + vk::MemoryPropertyFlagBits::eHostCoherent | + vk::MemoryPropertyFlagBits::eHostCached; + memory.stagingMemory = ggml_vk_allocate(size, 
memoryPropertyFlags, memoryRequirements, &isHostVisible); + mgr.device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0); + vk::Result r = mgr.device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data); + if (r != vk::Result::eSuccess) + std::cerr << "Error mapping memory" << vk::to_string(r); + } + + memory.size = size; + return memory; +} + +void ggml_vk_free_memory(ggml_vk_memory &memory) +{ + mgr.device()->destroy( + *memory.primaryBuffer, + (vk::Optional)nullptr); + if (memory.stagingBuffer) { + mgr.device()->destroy( + *memory.stagingBuffer, + (vk::Optional)nullptr); + } + mgr.device()->freeMemory( + *memory.primaryMemory, + (vk::Optional)nullptr); + if (memory.stagingMemory) { + mgr.device()->freeMemory( + *memory.stagingMemory, + (vk::Optional)nullptr); + } +} + +static +decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) { + for (auto it = ctx->buffers.begin(); ; it++) { + if (it == ctx->buffers.end()) { + fprintf(stderr, "%s: Failed to find tensor %p\n", __func__, t->data); + return it; + } + if (it->data <= t->data && + reinterpret_cast(it->data) + it->size >= (reinterpret_cast(t->data) + ggml_nbytes(t))) { + offset = reinterpret_cast(t->data) - reinterpret_cast(it->data); + return it; + } + } +} + +static +const std::shared_ptr ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint32_t *alignedOffset) { + uint64_t originalOffset = 0; + auto res = ggml_vk_find_tensor(ctx, t, originalOffset); + if (res == ctx->buffers.end()) { + static std::shared_ptr nullTensor = nullptr; + return nullTensor; + } + + // Create a tensor whose memory will be composed of our buffers at the correct offset + const size_t nelements = ggml_nelements(t); + size_t nbytes = ggml_nbytes(t); + + size_t vulkanOffset = ggml_vk_aligned_offset(originalOffset); + if (alignedOffset) { + *alignedOffset = originalOffset - vulkanOffset; + nbytes += *alignedOffset; + } + + return mgr.tensor( + t->data, + nelements, + nbytes, kp::Tensor::TensorDataTypes::eFloat, + res->primaryMemory, res->primaryBuffer, + res->stagingMemory, res->stagingBuffer, + vulkanOffset); +} + +void ggml_vk_add_buffer( + struct ggml_kompute_context * ctx, + const char * /*name*/, + const ggml_vk_memory &memory) { + ctx->buffers.emplace_back(memory); +} + +void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + const auto res = ggml_vk_get_tensor(ctx, t, nullptr); + GGML_ASSERT(res); + mgr.sequence()->eval({res}); +} + +void ggml_vk_h2d_all(struct ggml_kompute_context * ctx) { + for (auto& it : ctx->buffers) { + ggml_vk_h2d_buffer(it); + } + ctx->hasH2DAll = true; +} + +void ggml_vk_d2h_all(struct ggml_kompute_context * ctx) { + for (auto& it : ctx->buffers) { + ggml_vk_d2h_buffer(it); + } +} + +void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { + const auto res = ggml_vk_get_tensor(ctx, t, nullptr); + + GGML_ASSERT(res); + mgr.sequence()->eval({res}); +} + +std::vector getSpirvShader(const unsigned char* rawData, size_t size) { + if (size % sizeof(uint32_t) != 0) { + throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)"); + } + + const uint32_t* data_ptr = reinterpret_cast(rawData); + size_t count = size / sizeof(uint32_t); + return std::vector(data_ptr, data_ptr + count); +} + +inline static +uint32_t safe_divide(uint32_t a, uint32_t b) { + if (b <= 1) { + return a; + } + if ((a % b) != 
0) { + fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b); + GGML_ASSERT(!"safe_divide result would've had remainder"); + } + return a / b; +} + +void ggml_vk_add(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + uint32_t size) { + + const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv, + kp::shader_data::op_add_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + } const pushConsts { + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4) + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +void ggml_vk_addrow(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + uint32_t size, uint32_t row = 0) { + + const static auto spirv = getSpirvShader(kp::shader_data::op_addrow_comp_spv, + kp::shader_data::op_addrow_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + uint32_t row; + } const pushConsts { + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), + row + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +void ggml_vk_mul(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + uint32_t size) { + + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv, + kp::shader_data::op_mul_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + } const pushConsts { + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4) + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +void ggml_vk_mulrow(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + uint32_t size, uint32_t row = 0) { + + const static auto spirv = getSpirvShader(kp::shader_data::op_mulrow_comp_spv, + kp::shader_data::op_mulrow_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + uint32_t row; + } const pushConsts { + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), + row + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, 
{size}, {}, {pushConsts}); + else { + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +void ggml_vk_scale(kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + uint32_t size, float scale) { + const static auto spirv = getSpirvShader(kp::shader_data::op_scale_comp_spv, + kp::shader_data::op_scale_comp_spv_len); + + struct PushConstants { + uint32_t inOff, outOff; + float scale; + } const pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + scale + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + uint32_t size) { + struct PushConstants { + uint32_t inOff, outOff; + } const pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +template +void ggml_vk_silu(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv, + kp::shader_data::op_silu_comp_spv_len); + + ggml_vk_xxlu(spirv, std::forward(args)...); +} + +template +void ggml_vk_relu(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv, + kp::shader_data::op_relu_comp_spv_len); + + ggml_vk_xxlu(spirv, std::forward(args)...); +} + +template +void ggml_vk_gelu(Args&&... 
args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv, + kp::shader_data::op_gelu_comp_spv_len); + + ggml_vk_xxlu(spirv, std::forward(args)...); +} + +void ggml_vk_soft_max(kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03) { + + const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv, + kp::shader_data::op_softmax_comp_spv_len); + + struct PushConstants { + uint32_t inOff, outOff; + int32_t ne00, ne01, ne02; + } pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + ne00, ne01, ne02 + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + else { + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +void ggml_vk_norm_(const std::vector& spirv, kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + int32_t ne00, int32_t nb01, + int32_t nrows) { + GGML_ASSERT(nb01%sizeof(float) == 0); + GGML_ASSERT(ne00%sizeof(float) == 0); + + const float epsilon = 1e-6f; // this is what ggml.c uses for rms norm + + struct PushConstants { + uint32_t inOff, outOff; + uint32_t ne00, nb01; + float eps; + } pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + (uint32_t)ne00, (uint32_t)nb01, epsilon + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts}); + else { + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({(uint32_t)nrows}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +template +void ggml_vk_norm(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv, + kp::shader_data::op_norm_comp_spv_len); + + ggml_vk_norm_(spirv, std::forward(args)...); +} + +template +void ggml_vk_rms_norm(Args&&... 
args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv, + kp::shader_data::op_rmsnorm_comp_spv_len); + + ggml_vk_norm_(spirv, std::forward(args)...); +} + +void ggml_vk_diag_mask_inf(kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + uint32_t n_past, + int32_t ne00, int32_t ne01, int32_t ne02) { + const static auto spirv = getSpirvShader(kp::shader_data::op_diagmask_comp_spv, + kp::shader_data::op_diagmask_comp_spv_len); + + struct PushConstants { + uint32_t inOff, outOff; + uint32_t n_past; + int32_t ne00, ne01; + } pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + n_past, + ne00, ne01 + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts}); + else { + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +void ggml_vk_mul_mat_f16(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, + uint32_t nb01, uint32_t nb02, + int32_t ne11, int32_t ne12, + uint32_t nb11, uint32_t nb12, + int32_t ne0, int32_t ne1) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv, + kp::shader_data::op_mul_mat_f16_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00; + uint32_t nb01, nb02; + uint32_t nb11, nb12; + int32_t ne0, ne1; + } pushConsts { + safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, nb01, nb02, nb11, nb12, ne0, ne1, + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts}); + else { + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_size, kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne10, int32_t ne0, + int32_t ne01, int32_t ne11) { + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne10, ne0; + } pushConsts { + safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne10, ne0, + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts}); + else { + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +template +void ggml_vk_mul_mat_q4_0(Args&&... 
args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv, + kp::shader_data::op_mul_mat_q4_0_comp_spv_len); + + ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward(args)...); +} + +// FIXME: This could be improved like was done in q4_0 version but needs testing... +template +void ggml_vk_mul_mat_q4_1(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv, + kp::shader_data::op_mul_mat_q4_1_comp_spv_len); + + ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward(args)...); +} + +void ggml_vk_get_rows(const std::vector& spirv, + unsigned element_size, unsigned qk, + kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t nb01, int32_t nb1, + uint32_t size) { + GGML_ASSERT(nb01%element_size == 0); + GGML_ASSERT(nb1%sizeof(float) == 0); + if (qk) GGML_ASSERT(ne00%qk == 0); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, nb01, nb1; + } pushConsts { + safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, nb01, nb1 + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + else { + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({size}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +template +void ggml_vk_get_rows_f16(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv, + kp::shader_data::op_getrows_f16_comp_spv_len); + + ggml_vk_get_rows(spirv, sizeof(half), 0, std::forward(args)...); +} + +template +void ggml_vk_get_rows_q4_0(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv, + kp::shader_data::op_getrows_q4_0_comp_spv_len); + + ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_0, std::forward(args)...); +} + +template +void ggml_vk_get_rows_q4_1(Args&&... 
args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv, + kp::shader_data::op_getrows_q4_1_comp_spv_len); + + ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_1, std::forward(args)...); +} + +void ggml_vk_rope(kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + uint32_t n_past, int32_t n_dims, int32_t mode, + float freq_base, float freq_scale, + int32_t ne01, int32_t ne02, int32_t ne03, + uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, + int32_t ne0, + uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) { + const static auto spirv = getSpirvShader(kp::shader_data::op_rope_comp_spv, + kp::shader_data::op_rope_comp_spv_len); + + GGML_ASSERT(nb03%sizeof(float) == 0); + GGML_ASSERT(nb02%sizeof(float) == 0); + GGML_ASSERT(nb01%sizeof(float) == 0); + GGML_ASSERT(nb00%sizeof(float) == 0); + GGML_ASSERT(nb3%sizeof(float) == 0); + GGML_ASSERT(nb2%sizeof(float) == 0); + GGML_ASSERT(nb1%sizeof(float) == 0); + GGML_ASSERT(nb0%sizeof(float) == 0); + + struct PushConstants { + uint32_t inOff, outOff; + uint32_t n_past; + int32_t n_dims, mode; + float freq_base, freq_scale; + uint32_t nb00, nb01, nb02, nb03; + int32_t ne0; + uint32_t nb0, nb1, nb2, nb3; + } pushConsts { + safe_divide(inOff, 4), safe_divide(outOff, 4), + n_past, n_dims, mode, + freq_base, freq_scale, + nb00, nb01, nb02, nb03, + ne0, + nb0, nb1, nb2, nb3 + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + else { + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +template +void ggml_vk_cpy(const std::vector& spirv, + kp::Sequence& seq, + const std::shared_ptr& in, + const std::shared_ptr& out, + uint32_t inOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03, + uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, + int32_t ne0, int32_t ne1, int32_t ne2, + uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) { + struct PushConstants { + uint32_t inOff, outOff; + int32_t ne00, ne01, ne02; + uint32_t nb00, nb01, nb02, nb03; + int32_t ne0, ne1, ne2; + uint32_t nb0, nb1, nb2, nb3; + } pushConsts { + safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size), + ne00, ne01, ne02, + nb00, nb01, nb02, nb03, + ne0, ne1, ne2, + nb0, nb1, nb2, nb3 + }; + + static std::shared_ptr s_algo = nullptr; + if (!s_algo) + s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + else { + s_algo->setTensors({in, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + } + seq.record(s_algo); +} + +template +void ggml_vk_cpy_f32_f16(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv, + kp::shader_data::op_cpy_f32_f16_comp_spv_len); + ggml_vk_cpy<4, 2>(spirv, std::forward(args)...); +} + +template +void ggml_vk_cpy_f32_f32(Args&&... 
args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv, + kp::shader_data::op_cpy_f32_f32_comp_spv_len); + ggml_vk_cpy<4, 4>(spirv, std::forward(args)...); +} + +template +void ggml_vk_cpy_f16_f16(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv, + kp::shader_data::op_cpy_f16_f16_comp_spv_len); + ggml_vk_cpy<2, 2>(spirv, std::forward(args)...); +} + +template +void ggml_vk_cpy_f16_f32(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv, + kp::shader_data::op_cpy_f16_f32_comp_spv_len); + ggml_vk_cpy<2, 4>(spirv, std::forward(args)...); +} + +void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { + const int n_seq = 8; + + // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting + // it to the size of the graph, but I think it can be made smaller? + ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes); + + std::vector> sequences(n_seq); + + for (auto& sequence : sequences) { + sequence = mgr.sequence(); + } + for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) { + const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq; + + auto& seq = *sequences[seq_idx]; + + const int node_start = (seq_idx + 0) * n_nodes_per_seq; + const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq; + + for (int i = node_start; i < node_end; ++i) { + struct ggml_tensor * src0 = gf->nodes[i]->src[0]; + struct ggml_tensor * src1 = gf->nodes[i]->src[1]; + struct ggml_tensor * dst = gf->nodes[i]; + GGML_ASSERT(dst->data != nullptr); + + const int32_t ne00 = src0 ? src0->ne[0] : 0; + const int32_t ne01 = src0 ? src0->ne[1] : 0; + const int32_t ne02 = src0 ? src0->ne[2] : 0; + const int32_t ne03 = src0 ? src0->ne[3] : 0; + + const uint32_t nb00 = src0 ? src0->nb[0] : 0; + const uint32_t nb01 = src0 ? src0->nb[1] : 0; + const uint32_t nb02 = src0 ? src0->nb[2] : 0; + const uint32_t nb03 = src0 ? src0->nb[3] : 0; + + const int32_t ne10 = src1 ? src1->ne[0] : 0; + const int32_t ne11 = src1 ? src1->ne[1] : 0; + const int32_t ne12 = src1 ? src1->ne[2] : 0; +// const int32_t ne13 = src1 ? src1->ne[3] : 0; + +// const uint32_t nb10 = src1 ? src1->nb[0] : 0; + const uint32_t nb11 = src1 ? src1->nb[1] : 0; + const uint32_t nb12 = src1 ? src1->nb[2] : 0; +// const uint32_t nb13 = src1 ? src1->nb[3] : 0; + + const int32_t ne0 = dst ? dst->ne[0] : 0; + const int32_t ne1 = dst ? dst->ne[1] : 0; + const int32_t ne2 = dst ? dst->ne[2] : 0; +// const int32_t ne3 = dst ? dst->ne[3] : 0; + + const uint32_t nb0 = dst ? dst->nb[0] : 0; + const uint32_t nb1 = dst ? dst->nb[1] : 0; + const uint32_t nb2 = dst ? dst->nb[2] : 0; + const uint32_t nb3 = dst ? dst->nb[3] : 0; + + const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; +// const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; + + const static std::shared_ptr nullTensor = nullptr; + uint32_t off_src0 = 0; + uint32_t off_src1 = 0; + uint32_t off_dst = 0; + const std::shared_ptr& id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0, &off_src0) : nullTensor; + const std::shared_ptr& id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1, &off_src1) : nullTensor; + const std::shared_ptr& id_dst = dst ? 
ggml_vk_get_tensor(ctx, dst, &off_dst) : nullTensor; + + switch (dst->op) { + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + { + // noop + } break; + case GGML_OP_ADD: + { + if (ggml_nelements(src1) == ne10) { + // src1 is a row + ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00); + } else { + ggml_vk_add(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)); + } + } break; + case GGML_OP_MUL: + { + if (ggml_nelements(src1) == ne10) { + // src1 is a row + ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00); + } else { + ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)); + } + } break; + case GGML_OP_SCALE: + { + const float scale = *(const float *) src1->data; + ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale); + } break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(gf->nodes[i])) { + case GGML_UNARY_OP_SILU: + { + ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)); + } break; + case GGML_UNARY_OP_RELU: + { + ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)); + } break; + case GGML_UNARY_OP_GELU: + { + ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)); + } break; + default: + { + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); + } + } break; + case GGML_OP_SOFT_MAX: + { + ggml_vk_soft_max(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03); + } break; + case GGML_OP_DIAG_MASK_INF: + { + const int n_past = ((int32_t *)(dst->op_params))[0]; + ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02); + } break; + case GGML_OP_NORM: + { + ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0)); + } break; + case GGML_OP_RMS_NORM: + { + ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0)); + } break; + case GGML_OP_MUL_MAT: + { + if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32) + && src1->type == GGML_TYPE_F32) { + ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); + } else if (src0->type == GGML_TYPE_Q4_0 + && src1->type == GGML_TYPE_F32) { + ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11); + } else if (src0->type == GGML_TYPE_Q4_1 + && src1->type == GGML_TYPE_F32) { + ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11); + } else { + fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type); + goto not_implemented; + } + } break; + case GGML_OP_GET_ROWS: + { + if (src0->type == GGML_TYPE_F16) { + ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); + } else if (src0->type == GGML_TYPE_Q4_0) { + ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); + } else if (src0->type == GGML_TYPE_Q4_1) { + ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); + } else { + fprintf(stderr, "%s: %s: Unsupported quantization: 
%u\n", __func__, ggml_op_name(dst->op), src0->type); + goto not_implemented; + } + } break; + case GGML_OP_ROPE: + { + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + float freq_base; + float freq_scale; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); + ggml_vk_rope(seq, id_src0, id_dst, off_src0, off_dst, n_past, n_dims, mode, freq_base, freq_scale, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3); + } break; + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + { + switch (src0t) { + case GGML_TYPE_F32: + { + switch (dstt) { + case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; + case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; + default: goto not_implemented; + } + } break; + case GGML_TYPE_F16: + { + switch (dstt) { + case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; + case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; + default: goto not_implemented; + } break; + default: goto not_implemented; + } + } + } break; + default: goto not_implemented; + } + continue; + not_implemented: {} + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + //GGML_ASSERT(false); + } + + // Evaluate sequence + seq.evalAsync(); + } + + // Wait for all sequences to finish + for (auto& sequence : sequences) { + if (sequence->isRunning()) + sequence->evalAwait(); + } + + ggml_vk_free_descriptor_pool(ctx); +} + +template<> +kp::Tensor::TensorDataTypes +kp::TensorT::dataType() +{ + return TensorDataTypes::eFloat; +} + +template<> +kp::Tensor::TensorDataTypes +kp::TensorT::dataType() +{ + return TensorDataTypes::eUnsignedInt; +} diff --git a/ggml-vulkan.h b/ggml-vulkan.h new file mode 100644 index 000000000..ad8b41e4d --- /dev/null +++ b/ggml-vulkan.h @@ -0,0 +1,61 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#pragma once + +#include +#include +#include + +struct ggml_kompute_context; + +namespace vk { + class DeviceMemory; + class Buffer; +}; + +struct ggml_vk_memory { + void *data = nullptr; + size_t size = 0; + vk::DeviceMemory *primaryMemory = nullptr; + vk::Buffer *primaryBuffer = nullptr; + vk::DeviceMemory *stagingMemory = nullptr; + vk::Buffer *stagingBuffer = nullptr; +}; + +struct ggml_vk_device { + int index = 0; + int type = 0; // same as VkPhysicalDeviceType + size_t heapSize = 0; + std::string name; + std::string vendor; +}; + +std::vector ggml_vk_available_devices(size_t memoryRequired); +bool ggml_vk_init_device(size_t memoryRequired, const std::string &device); +bool ggml_vk_init_device(const ggml_vk_device &device); +bool ggml_vk_init_device(int device); +bool ggml_vk_has_device(); +ggml_vk_device ggml_vk_current_device(); +struct ggml_kompute_context * ggml_vk_init(void); +bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx); +void ggml_vk_free(struct ggml_kompute_context * ctx); +size_t ggml_vk_aligned_offset(size_t offset); +ggml_vk_memory ggml_vk_allocate(size_t size); +void ggml_vk_free_memory(ggml_vk_memory &memory); + +void ggml_vk_add_buffer( + struct ggml_kompute_context * ctx, + const char * name, + const ggml_vk_memory &memory); + +void ggml_vk_h2d_all(struct ggml_kompute_context * ctx); +void ggml_vk_d2h_all(struct ggml_kompute_context * ctx); +void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); +void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); +void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf); diff --git a/ggml.c b/ggml.c index a0be068d6..cf9e056ba 100644 --- a/ggml.c +++ b/ggml.c @@ -9007,7 +9007,7 @@ static void ggml_compute_forward_add_q_f32( } } -static void ggml_compute_forward_add( +void ggml_compute_forward_add( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -9587,7 +9587,7 @@ static void ggml_compute_forward_mul_f32( } } -static void ggml_compute_forward_mul( +void ggml_compute_forward_mul( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -10510,7 +10510,7 @@ static void ggml_compute_forward_elu( // ggml_compute_forward_relu -static void ggml_compute_forward_relu_f32( +void ggml_compute_forward_relu_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -10534,7 +10534,7 @@ static void ggml_compute_forward_relu_f32( } } -static void ggml_compute_forward_relu( +void ggml_compute_forward_relu( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -10552,7 +10552,7 @@ static void ggml_compute_forward_relu( // ggml_compute_forward_gelu -static void ggml_compute_forward_gelu_f32( +void ggml_compute_forward_gelu_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -10593,7 +10593,7 @@ static void ggml_compute_forward_gelu_f32( } } -static void ggml_compute_forward_gelu( +void ggml_compute_forward_gelu( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -10670,7 +10670,7 @@ static void ggml_compute_forward_gelu_quick( // ggml_compute_forward_silu -static void ggml_compute_forward_silu_f32( +void ggml_compute_forward_silu_f32( const struct ggml_compute_params * params, const struct ggml_tensor * 
src0, struct ggml_tensor * dst) { @@ -10711,7 +10711,7 @@ static void ggml_compute_forward_silu_f32( } } -static void ggml_compute_forward_silu( +void ggml_compute_forward_silu( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -10844,7 +10844,7 @@ static void ggml_compute_forward_norm_f32( } } -static void ggml_compute_forward_norm( +void ggml_compute_forward_norm( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -10910,7 +10910,7 @@ static void ggml_compute_forward_rms_norm_f32( } } -static void ggml_compute_forward_rms_norm( +void ggml_compute_forward_rms_norm( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -11623,7 +11623,7 @@ static void ggml_compute_forward_scale_f32( } } -static void ggml_compute_forward_scale( +void ggml_compute_forward_scale( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -11744,7 +11744,7 @@ static void ggml_compute_forward_set( // ggml_compute_forward_cpy -static void ggml_compute_forward_cpy( +void ggml_compute_forward_cpy( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -11888,7 +11888,7 @@ static void ggml_compute_forward_get_rows_f32( } } -static void ggml_compute_forward_get_rows( +void ggml_compute_forward_get_rows( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -12164,7 +12164,7 @@ static void ggml_compute_forward_diag_mask_f32( } } -static void ggml_compute_forward_diag_mask_inf( +void ggml_compute_forward_diag_mask_inf( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -12198,7 +12198,7 @@ static void ggml_compute_forward_diag_mask_zero( // ggml_compute_forward_soft_max -static void ggml_compute_forward_soft_max_f32( +void ggml_compute_forward_soft_max_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -12887,7 +12887,7 @@ static void ggml_compute_forward_rope_f16( } } -static void ggml_compute_forward_rope( +void ggml_compute_forward_rope( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { diff --git a/kompute/.ccls b/kompute/.ccls new file mode 100644 index 000000000..71d5d711e --- /dev/null +++ b/kompute/.ccls @@ -0,0 +1,27 @@ + +%clang + +-fdeclspec +-fms-extensions +-Wall +-Wextra +-std=c++17 + +%h -x +%h c++-header + +-DDEBUG=1 +-DKOMPUTE_INCLUDE_FOR_SYNTAX + +-I/usr/include/python3.6/ +-I./python/pybind11/include/ + +-I./build/_deps/vulkan_header-src/include/ +-I./build/_deps/spdlog-src/include/ +-I./build/_deps/googletest-src/googletest/include/ +-I./build/_deps/fmt-src/include/ + +-I./src/include/ +-I./build/src/shaders/glsl/ +-I./build/test/shaders/glsl/ +-I./test/utils/ diff --git a/kompute/.clang-format b/kompute/.clang-format new file mode 100644 index 000000000..5191313a3 --- /dev/null +++ b/kompute/.clang-format @@ -0,0 +1,5 @@ +--- +BasedOnStyle: Mozilla +IndentWidth: 4 + +... 
diff --git a/kompute/.dockerignore b/kompute/.dockerignore new file mode 100644 index 000000000..9498d9195 --- /dev/null +++ b/kompute/.dockerignore @@ -0,0 +1,4 @@ +build/* +examples/* +docker-builders/ +swiftshader/ diff --git a/kompute/.github/workflows/cpp_examples.yml b/kompute/.github/workflows/cpp_examples.yml new file mode 100644 index 000000000..ad5306e9b --- /dev/null +++ b/kompute/.github/workflows/cpp_examples.yml @@ -0,0 +1,58 @@ +name: C++ Tests + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + array-multiplication-example: + runs-on: ubuntu-latest + container: axsauze/kompute-builder:0.4 + env: + VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + - name: "[Release g++] Build & Test" + uses: KomputeProject/action-cmake-build@master + with: + build-dir: ${{github.workspace}}/examples/array_multiplication/build + source-dir: ${{github.workspace}}/examples/array_multiplication + cc: gcc + cxx: g++ + build-type: Debug + run-test: false + ctest-options: -V + configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON + build-options: --parallel # Given we don't build too many resources we can leverage parallel + - name: Run tests + run: ./examples/array_multiplication/build/src/kompute_array_mult + + logistc-regression-example: + runs-on: ubuntu-latest + container: axsauze/kompute-builder:0.4 + env: + VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + - name: "[Release g++] Build & Test" + uses: KomputeProject/action-cmake-build@master + with: + build-dir: ${{github.workspace}}/examples/logistic_regression/build + source-dir: ${{github.workspace}}/examples/logistic_regression + cc: gcc + cxx: g++ + build-type: Debug + run-test: false + ctest-options: -V + configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON + build-options: --parallel # Given we don't build too many resources we can leverage parallel + - name: Run tests + run: ./examples/logistic_regression/build/src/kompute_logistic_regression diff --git a/kompute/.github/workflows/cpp_tests.yml b/kompute/.github/workflows/cpp_tests.yml new file mode 100644 index 000000000..53a90a145 --- /dev/null +++ b/kompute/.github/workflows/cpp_tests.yml @@ -0,0 +1,104 @@ +name: C++ Tests + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + cpp-tests-debug-with-debug-layers: + runs-on: ubuntu-latest + container: axsauze/kompute-builder:0.4 + env: + VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + - name: "[Release g++] Build & Test" + uses: KomputeProject/action-cmake-build@master + with: + build-dir: ${{github.workspace}}/build + source-dir: ${{github.workspace}} + cc: gcc + cxx: g++ + build-type: Debug + run-test: false + ctest-options: -V + configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON + - name: Run tests + run: make mk_run_tests + + cpp-tests-release-with-debug-layers: + runs-on: ubuntu-latest + container: axsauze/kompute-builder:0.4 + env: + VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + - name: "[Release g++] Build & Test" + uses: 
KomputeProject/action-cmake-build@master + with: + build-dir: ${{github.workspace}}/build + source-dir: ${{github.workspace}} + cc: gcc + cxx: g++ + build-type: Release + run-test: false + ctest-options: -V + configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON + - name: Run tests + run: make mk_run_tests + + cpp-tests-debug-without-debug-layers: + runs-on: ubuntu-latest + container: axsauze/kompute-builder:0.4 + env: + VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + - name: "[Release g++] Build & Test" + uses: KomputeProject/action-cmake-build@master + with: + build-dir: ${{github.workspace}}/build + source-dir: ${{github.workspace}} + cc: gcc + cxx: g++ + build-type: Debug + run-test: false + ctest-options: -V + configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON + - name: Run tests + run: make mk_run_tests + + cpp-tests-release-without-debug-layers: + runs-on: ubuntu-latest + container: axsauze/kompute-builder:0.4 + env: + VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + - name: "[Release g++] Build & Test" + uses: KomputeProject/action-cmake-build@master + with: + build-dir: ${{github.workspace}}/build + source-dir: ${{github.workspace}} + cc: gcc + cxx: g++ + build-type: Release + run-test: false + ctest-options: -V + configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON + - name: Run tests + run: make mk_run_tests diff --git a/kompute/.github/workflows/python_tests.yml b/kompute/.github/workflows/python_tests.yml new file mode 100644 index 000000000..9f84d1e85 --- /dev/null +++ b/kompute/.github/workflows/python_tests.yml @@ -0,0 +1,28 @@ +name: Python Tests + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +jobs: + python-tests: + runs-on: ubuntu-latest + container: axsauze/kompute-builder:0.4 + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: false + - name: Install Python Requirements + run: pip3 install --user -r python/test/requirements-dev.txt + - name: Python Build + env: + KOMPUTE_PYTHON_NUM_PARALLEL_THREADS: 2 + KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER: ON + run: pip3 install --user . 
-v + - name: Python run Tests + run: | + export VK_ICD_FILENAMES=/swiftshader/vk_swiftshader_icd.json + make test_python diff --git a/kompute/CMakeLists.txt b/kompute/CMakeLists.txt new file mode 100644 index 000000000..f89e13d1d --- /dev/null +++ b/kompute/CMakeLists.txt @@ -0,0 +1,187 @@ +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20) +project(kompute VERSION 0.8.1 LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 14) + +# Only change the folder behavior if kompute is not a subproject +if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) + set_property(GLOBAL PROPERTY USE_FOLDERS ON) + set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMake") + set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin) + set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib) +endif() + +# Avoid the dll boilerplate code for windows +set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}") + +set(KOMPUTE_LIBRARIES kompute CACHE INTERNAL "") + +# #################################################### +# Options +# #################################################### +macro(kompute_option OPTION_NAME OPTION_TEXT OPTION_DEFAULT) + option(${OPTION_NAME} ${OPTION_TEXT} ${OPTION_DEFAULT}) + + if(DEFINED ENV{${OPTION_NAME}}) + # Allow overriding the option through an environment variable + set(${OPTION_NAME} $ENV{${OPTION_NAME}}) + endif() + + if(${OPTION_NAME}) + add_definitions(-D${OPTION_NAME}) + endif() + + message(STATUS " ${OPTION_NAME}: ${${OPTION_NAME}}") +endmacro() + +macro(kompute_log_level OPTION_NAME OPTION_TEXT OPTION_DEFAULT) + set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT}) + set_property(CACHE ${OPTION_NAME} PROPERTY STRINGS "Trace" "Debug" "Info" "Warn" "Error" "Critical" "Default" "Off") + + if(DEFINED ENV{${OPTION_NAME}}) + # Allow setting the option through an environment variable + set(${OPTION_NAME} $ENV{${OPTION_NAME}}) + endif() + + if(${OPTION_NAME}) + add_definitions(-D${OPTION_NAME}) + endif() + + # Allow disabling logging completely and prevent linking against it: + if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Off") + set(${OPTION_NAME}_DISABLED ON) + add_compile_definitions(${OPTION_NAME}_DISABLED=1) + endif() + + message(STATUS " ${OPTION_NAME}: ${${OPTION_NAME}}") +endmacro() + +macro(kompute_option_string OPTION_NAME OPTION_TEXT OPTION_DEFAULT) + set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT}) + + if(DEFINED ENV{${OPTION_NAME}}) + # Allow setting the option through an environment variable + set(${OPTION_NAME} $ENV{${OPTION_NAME}}) + endif() + + if(${OPTION_NAME}) + add_definitions(-D${OPTION_NAME}) + endif() + + message(STATUS " ${OPTION_NAME}: ${${OPTION_NAME}}") +endmacro() + +message(STATUS "General purpose GPU compute framework built on Vulkan") +message(STATUS "=======================================================") + +# Build options +kompute_log_level(KOMPUTE_OPT_LOG_LEVEL "Internally we use Spdlog or fmt for logging, depending on the value of 'KOMPUTE_OPT_USE_SPDLOG'. The log level used can be changed here. Possible values: 'Trace', 'Debug', 'Info', 'Warn', 'Error', 'Critical', 'Off', 'Default'. If set to 'Off' logging will be deactivated completely. If set to 'Default', the log level will be set to 'Info' for release builds and 'Debug' else." "Off") +kompute_option(KOMPUTE_OPT_USE_SPDLOG "If enabled, logging via KP_LOG_ will happen through Spdlog instead of plan fmt." 
OFF) +kompute_option(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS "Explicitly disable debug layers even on debug." ON) +kompute_option(KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK "Whether to check if your driver supports the Vulkan Header version you are linking against. This might be useful in case you build shared on a different system than you run later." OFF) +kompute_option(KOMPUTE_OPT_BUILD_SHADERS "Rebuilds all compute shaders during compilation and does not use the already precompiled versions. Requires glslangValidator to be installed on your system." OFF) + +# External components +kompute_option(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG "Use the built-in version of Spdlog. Requires 'KOMPUTE_OPT_USE_SPDLOG' to be set to ON in order to have any effect." ON) +kompute_option(KOMPUTE_OPT_SPDLOG_ASYNC_MODE "If spdlog is enabled this allows for selecting whether the default logger setup creates sync or async logger" OFF) +kompute_option(KOMPUTE_OPT_USE_BUILT_IN_FMT "Use the built-in version of fmt." ON) +kompute_option(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER "Use the built-in version of Vulkan Headers. This could be helpful in case your system Vulkan Headers are too new for your driver. If you set this to OFF, please make sure your system Vulkan Headers are supported by your driver." ON) +kompute_option_string(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "The git tag used for the built-in Vulkan Headers when 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER' is enabled. A list of tags can be found here: https://github.com/KhronosGroup/Vulkan-Headers/tags" "v1.3.231") +message(STATUS "=======================================================") + +# #################################################### +# Deprecated Options +# #################################################### +include(cmake/deprecation_warnings.cmake) + +# #################################################### +# Dependencies +# #################################################### +include(cmake/vulkan_shader_compiler.cmake) +include(cmake/check_vulkan_version.cmake) +include(FetchContent) + +# Vulkan Header +if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER) + FetchContent_Declare(vulkan_header GIT_REPOSITORY https://github.com/KhronosGroup/Vulkan-Headers.git + GIT_TAG ${KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG}) # Source: https://github.com/KhronosGroup/Vulkan-Headers/tags + FetchContent_MakeAvailable(vulkan_header) + + if(NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK) + # Ensure the driver supports this Vulkan version + check_vulkan_version(INCLUDE_DIR "${vulkan_header_SOURCE_DIR}/include") + endif() +endif() + +find_package(Vulkan REQUIRED) + +if(Vulkan_FOUND AND NOT TARGET Vulkan::Headers) + add_library(Vulkan::Headers INTERFACE IMPORTED) + set_target_properties(Vulkan::Headers PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${Vulkan_INCLUDE_DIRS}") +endif() + +if(NOT KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER AND NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK) + # Ensure the driver supports this Vulkan version + check_vulkan_version(INCLUDE_DIR ${Vulkan_INCLUDE_DIR}) +endif() + +# Spdlog +if(KOMPUTE_OPT_USE_SPDLOG) + add_compile_definitions(KOMPUTE_OPT_USE_SPDLOG=1) + + if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED) + if(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG) + set(SPDLOG_BUILD_SHARED ${BUILD_SHARED_LIBS}) + + FetchContent_Declare(spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git + GIT_TAG v1.10.0) # Source: https://github.com/gabime/spdlog/releases + FetchContent_MakeAvailable(spdlog) + else() + find_package(spdlog REQUIRED) + endif() + endif() +endif() + +# fmt 
+if(KOMPUTE_OPT_USE_BUILT_IN_FMT) + FetchContent_Declare(fmt GIT_REPOSITORY https://github.com/fmtlib/fmt.git + GIT_TAG 10.0.0) # Source: https://github.com/fmtlib/fmt/releases + FetchContent_MakeAvailable(fmt) +else() + find_package(fmt REQUIRED) +endif() + +# #################################################### +# Preprocessor Macros +# #################################################### +if(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS) + add_compile_definitions(KOMPUTE_DISABLE_VK_DEBUG_LAYERS=1) +endif() + +if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror") +endif() + +# If glslang is cloned, then SPIRV/GlslangToSpv.h will be used instead of glslang/SPIRV/GlslangToSpv.h +# As after installation, SPIRV/ header files will be found in glslang/SPIRV/ , more info in #193 +if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD) + add_definitions(-DUSE_EXTERNAL_GLSLANG) +endif() + +# Allow scripts to call main kompute Makefile +function(kompute_make KOMPUTE_MAKE_TARGET) + add_custom_target(${KOMPUTE_MAKE_TARGET} + COMMAND make -C ${PROJECT_SOURCE_DIR} ${KOMPUTE_MAKE_TARGET}) +endfunction() + +add_executable(xxd external/bin/xxd.c) + +add_subdirectory(src) diff --git a/kompute/LICENSE b/kompute/LICENSE new file mode 100644 index 000000000..821a2723e --- /dev/null +++ b/kompute/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 The Institute for Ethical AI & Machine Learning + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/kompute/Makefile b/kompute/Makefile new file mode 100644 index 000000000..62ad68b46 --- /dev/null +++ b/kompute/Makefile @@ -0,0 +1,210 @@ +# This makefile is optimized to be run from WSL and to interact with the +# Windows host as there are limitations when building GPU programs. This +# makefile contains the commands for interacting with the visual studio +# build via command line for faster iterations, as the intention is to +# support other editors (optimised for vim). There are also commands that +# support the builds for linux-native compilations and these are the commands +# starting with mk_. 
+ +VERSION := $(shell cat ./VERSION) + +VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsystems\\vcpkg.cmake" +VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake" + +# These are the tests that don't work with swiftshader but can be run directly with vulkan +FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps:TestPushConstants.TestConstantsDouble" + +ifeq ($(OS),Windows_NT) # is Windows_NT on XP, 2000, 7, Vista, 10... + CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe" + SCMP_BIN="C:\\VulkanSDK\\1.2.141.2\\Bin32\\glslangValidator.exe" + MSBUILD_BIN ?= "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\MSBuild\\Current\\Bin\\MSBuild.exe" +else + CLANG_FORMAT_BIN ?= "/home/alejandro/Programming/lib/clang+llvm-10.0.0-x86_64-linux-gnu-ubuntu-18.04/bin/clang-format" + CMAKE_BIN ?= "/c/Program Files/CMake/bin/cmake.exe" + MSBUILD_BIN ?= "/c/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe" + # Choosing the binary based on whether it's on WSL or linux-native + KERNEL := $(shell uname -r) + IS_WSL := $(shell (if [[ "$(KERNEL)" =~ Microsoft$ ]]; then echo '0'; fi)) + ifeq ($(IS_WSL),0) + SCMP_BIN ?= "/c/VulkanSDK/1.2.141.2/Bin32/glslangValidator.exe" + else + SCMP_BIN ?= "/usr/bin/glslangValidator" + endif +endif + + +####### Main Target Rules ####### + +push_docs_to_ghpages: + GIT_DEPLOY_DIR="build/docs/sphinx/" \ + GIT_DEPLOY_BRANCH="gh-pages" \ + GIT_DEPLOY_REPO="origin" \ + ./scripts/push_folder_to_branch.sh + +####### CMAKE quickstart commands ####### + +clean_cmake: + rm -rf build/ + +####### Visual studio build shortcut commands ####### + +MK_BUILD_TYPE ?= "Release" +MK_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default +MK_CMAKE_EXTRA_FLAGS ?= "" +MK_KOMPUTE_EXTRA_CXX_FLAGS ?= "" + +mk_cmake: + cmake \ + -Bbuild \ + -DCMAKE_CXX_FLAGS=$(MK_KOMPUTE_EXTRA_CXX_FLAGS) \ + -DCMAKE_BUILD_TYPE=$(MK_BUILD_TYPE) \ + -DCMAKE_INSTALL_PREFIX=$(MK_INSTALL_PATH) \ + -DKOMPUTE_OPT_INSTALL=ON \ + -DKOMPUTE_OPT_BUILD_TESTS=ON \ + -DKOMPUTE_OPT_BUILD_DOCS=ON \ + -DKOMPUTE_OPT_BUILD_SHADERS=ON \ + -DKOMPUTE_OPT_CODE_COVERAGE=ON \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ + -DKOMPUTE_OPT_LOG_LEVEL=Debug \ + $(MK_CMAKE_EXTRA_FLAGS) \ + -G "Unix Makefiles" + +mk_build_all: + cmake --build build/. --parallel + +mk_build_docs: + cmake --build build/. --target gendocsall --parallel + +mk_build_kompute: + cmake --build build/. --target kompute --parallel + +mk_build_tests: + cmake --build build/. --target kompute_tests --parallel + +mk_run_docs: mk_build_docs + (cd build/docs/sphinx && python2.7 -m SimpleHTTPServer) + +# An alternative would be: ctest -vv --test-dir build/. +# But this is not possible since we need to filter specific tests, not complete executables, which is not possible with ctest. +# https://gitlab.kitware.com/cmake/cmake/-/issues/13168 +mk_run_tests: mk_build_tests + ./build/bin/kompute_tests --gtest_filter=$(FILTER_TESTS) + +mk_build_swiftshader_library: + git clone https://github.com/google/swiftshader || echo "Assuming already cloned" + # GCC 8 or above is required otherwise error on "filesystem" lib will appear + CC="/usr/bin/gcc-8" CXX="/usr/bin/g++-8" cmake swiftshader/. -Bswiftshader/build/ + cmake --build swiftshader/build/. 
--parallel + +mk_run_tests_cpu: export VK_ICD_FILENAMES=$(PWD)/swiftshader/build/vk_swiftshader_icd.json +mk_run_tests_cpu: mk_build_swiftshader_library mk_build_tests mk_run_tests_cpu_only + + +####### Visual studio build shortcut commands ####### + +VS_BUILD_TYPE ?= "Debug" +# Run with multiprocessin / parallel build by default +VS_CMAKE_EXTRA_FLAGS ?= "" +VS_KOMPUTE_EXTRA_CXX_FLAGS ?= "" +VS_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default + +vs_cmake: + $(CMAKE_BIN) \ + -Bbuild \ + $(VS_CMAKE_EXTRA_FLAGS) \ + -DCMAKE_TOOLCHAIN_FILE=$(VCPKG_WIN_PATH) \ + -DCMAKE_CXX_FLAGS=$(VS_KOMPUTE_EXTRA_CXX_FLAGS) \ + -DCMAKE_INSTALL_PREFIX=$(VS_INSTALL_PATH) \ + -DKOMPUTE_OPT_INSTALL=ON \ + -DKOMPUTE_OPT_BUILD_TESTS=ON \ + -DKOMPUTE_OPT_BUILD_SHADERS=ON \ + -DKOMPUTE_OPT_CODE_COVERAGE=OFF \ + -DKOMPUTE_OPT_BUILD_DOCS=OFF \ + -G "Visual Studio 16 2019" \ + -DCMAKE_BUILD_TYPE=$(VS_BUILD_TYPE) + +vs_build_all: + cmake --build build/. --parallel + +vs_build_docs: + cmake --build build/. --target gendocsall --parallel + +vs_install_kompute: + cmake --build build/. --target install --parallel + +vs_build_kompute: + cmake --build build/. --target kompute --parallel + +vs_build_tests: + cmake --build build/. --target kompute_tests --parallel + +vs_run_docs: vs_build_docs + (cd build/docs/sphinx && python2.7 -m SimpleHTTPServer) + +vs_run_tests: vs_build_tests + ./build/test/$(VS_BUILD_TYPE)/bin/kompute_tests.exe --gtest_filter=$(FILTER_TESTS) + + +#### PYTHONG #### + +test_python: + python3 -m pytest -s --log-cli-level=DEBUG -v python/test/ + +####### Run CI Commands ####### + +# This command uses act to replicate github action +# https://github.com/nektos/act +run_ci: + act + +####### General project commands ####### + +generate_python_docstrings: + python -m pybind11_mkdoc \ + -o python/src/docstrings.hpp \ + kompute/Kompute.hpp \ + -Iexternal/fmt/include/ \ + -Iexternal/spdlog/include/ \ + -Iexternal/glslang/ \ + -I/usr/include/c++/7.5.0/ + +install_python_reqs: + python3 -m pip install -r scripts/requirements.txt + +install_lcov: + sudo apt install lcov -y + +build_shaders: + python3 scripts/convert_shaders.py \ + --shader-path shaders/glsl \ + --shader-binary $(SCMP_BIN) \ + --header-path src/include/kompute/shaders/ \ + -v + python3 scripts/convert_shaders.py \ + --shader-path test/shaders/glsl \ + --shader-binary $(SCMP_BIN) \ + --header-path test/compiled_shaders_include/kompute_test/shaders/ \ + -v + +build_single_header: + quom \ + --include_directory \ + "src/include/" \ + "single_include/AggregateHeaders.cpp" \ + "single_include/kompute/Kompute.hpp" + +win_build_xxd: + cd external/bin/ && gcc.exe -o xxd.exe xxd.c -DCYGWIN + +format: + for val in "examples single_include src test" ; do \ + find $$val -depth -iname *.h -or -iname *.c -or -iname *.hpp -or -iname *.cpp | grep -v "shaders" | xargs $(CLANG_FORMAT_BIN) -style=file -i; \ + done + +static_scan: + cppcheck --project=build/compile_commands.json -iexternal/ + +build_changelog: + docker run --rm -it -v "$(PWD)":/usr/local/src/your-app -e CHANGELOG_GITHUB_TOKEN=${CHANGELOG_GITHUB_TOKEN} ferrarimarco/github-changelog-generator:1.15.2 -u KomputeProject -p kompute + chmod 664 CHANGELOG.md # (Read+Write, Read+Write, Read) + sed -i -e 's/\(HEAD\|Unreleased\)/v${VERSION}/g' CHANGELOG.md # Replacing unreleased version with latest tag diff --git a/kompute/README.md b/kompute/README.md new file mode 100644 index 000000000..b169da254 --- /dev/null +++ b/kompute/README.md @@ -0,0 +1,513 @@ + 
+![GitHub](https://img.shields.io/badge/Version-0.7.0-green.svg)
+![GitHub](https://img.shields.io/badge/C++-14—20-purple.svg)
+![GitHub](https://img.shields.io/badge/Build-cmake-red.svg)
+![GitHub](https://img.shields.io/badge/Python-3.7—3.9-blue.svg)
+![GitHub](https://img.shields.io/badge/License-Apache-black.svg)
+[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/4834/badge)](https://bestpractices.coreinfrastructure.org/projects/4834)
+
+<!-- Kompute logo -->
+
+# Kompute
+
+The general purpose GPU compute framework for cross vendor graphics cards (AMD, Qualcomm, NVIDIA & friends)
+
+Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU acceleration usecases.
+
+💬 [Join the Discord & Community Calls](https://kompute.cc/overview/community.html) 🔋 [Documentation](https://kompute.cc) 💻 [Blog Post](https://medium.com/@AxSaucedo/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) ⌨ [Examples](#more-examples) 💾
+
+##### Kompute is backed by the Linux Foundation as a hosted project by the LF AI & Data Foundation.
+
+<!-- Sponsor and foundation logos -->
+ + +## Principles & Features + +* [Flexible Python module](#your-first-kompute-python) with [C++ SDK](#your-first-kompute-c) for optimizations +* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues +* [Mobile enabled](#mobile-enabled) with examples via Android NDK across several architectures +* BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications +* Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html) +* Robust codebase with [90% unit test code coverage](https://kompute.cc/codecov/) +* Advanced use-cases on [machine learning 🤖](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a), [mobile development 📱](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) and [game development 🎮](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0). +* Active community with [monthly calls, discord chat and more](https://kompute.cc/overview/community.html) + +![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/komputer-logos.gif) + +## Getting Started + +Below you can find a GPU multiplication example using the C++ and Python Kompute interfaces. + +You can [join the Discord](https://discord.gg/MaH5Jv5zwv) for questions / discussion, open a [github issue](https://github.com/KomputeProject/kompute/issues/new), or read [the documentation](https://kompute.cc/). + +### Your First Kompute (C++) + +The C++ interface provides low level access to the native components of Kompute, enabling for [advanced optimizations](https://kompute.cc/overview/async-parallel.html) as well as [extension of components](https://kompute.cc/overview/reference.html). + +```c++ + +void kompute(const std::string& shader) { + + // 1. Create Kompute Manager with default settings (device 0, first queue and no extensions) + kp::Manager mgr; + + // 2. Create and initialise Kompute Tensors through manager + + // Default tensor constructor simplifies creation of float values + auto tensorInA = mgr.tensor({ 2., 2., 2. }); + auto tensorInB = mgr.tensor({ 1., 2., 3. }); + // Explicit type constructor supports uint32, int32, double, float and bool + auto tensorOutA = mgr.tensorT({ 0, 0, 0 }); + auto tensorOutB = mgr.tensorT({ 0, 0, 0 }); + + std::vector> params = {tensorInA, tensorInB, tensorOutA, tensorOutB}; + + // 3. Create algorithm based on shader (supports buffers & push/spec constants) + kp::Workgroup workgroup({3, 1, 1}); + std::vector specConsts({ 2 }); + std::vector pushConstsA({ 2.0 }); + std::vector pushConstsB({ 3.0 }); + + auto algorithm = mgr.algorithm(params, + // See documentation shader section for compileSource + compileSource(shader), + workgroup, + specConsts, + pushConstsA); + + // 4. Run operation synchronously using sequence + mgr.sequence() + ->record(params) + ->record(algorithm) // Binds default push consts + ->eval() // Evaluates the two recorded operations + ->record(algorithm, pushConstsB) // Overrides push consts + ->eval(); // Evaluates only last recorded operation + + // 5. Sync results from the GPU asynchronously + auto sq = mgr.sequence(); + sq->evalAsync(params); + + // ... 
Do other work asynchronously whilst GPU finishes + + sq->evalAwait(); + + // Prints the first output which is: { 4, 8, 12 } + for (const float& elem : tensorOutA->vector()) std::cout << elem << " "; + // Prints the second output which is: { 10, 10, 10 } + for (const float& elem : tensorOutB->vector()) std::cout << elem << " "; + +} // Manages / releases all CPU and GPU memory resources + +int main() { + + // Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header + // files). This shader shows some of the main components including constants, buffers, etc + std::string shader = (R"( + #version 450 + + layout (local_size_x = 1) in; + + // The input tensors bind index is relative to index in parameter passed + layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; }; + layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; }; + layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; }; + layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; }; + + // Kompute supports push constants updated on dispatch + layout(push_constant) uniform PushConstants { + float val; + } push_const; + + // Kompute also supports spec constants on initalization + layout(constant_id = 0) const float const_one = 0; + + void main() { + uint index = gl_GlobalInvocationID.x; + out_a[index] += uint( in_a[index] * in_b[index] ); + out_b[index] += uint( const_one * push_const.val ); + } + )"); + + // Run the function declared above with our raw string shader + kompute(shader); +} + +``` + +### Your First Kompute (Python) + +The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables for experimentation whilst ensuring high performance and fast development workflows. + +```python + +from .utils import compile_source # using util function from python/test/utils + +def kompute(shader): + # 1. Create Kompute Manager with default settings (device 0, first queue and no extensions) + mgr = kp.Manager() + + # 2. Create and initialise Kompute Tensors through manager + + # Default tensor constructor simplifies creation of float values + tensor_in_a = mgr.tensor([2, 2, 2]) + tensor_in_b = mgr.tensor([1, 2, 3]) + # Explicit type constructor supports uint32, int32, double, float and bool + tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32)) + tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32)) + + params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b] + + # 3. Create algorithm based on shader (supports buffers & push/spec constants) + workgroup = (3, 1, 1) + spec_consts = [2] + push_consts_a = [2] + push_consts_b = [3] + + # See documentation shader section for compile_source + spirv = compile_source(shader) + + algo = mgr.algorithm(params, spirv, workgroup, spec_consts, push_consts_a) + + # 4. Run operation synchronously using sequence + (mgr.sequence() + .record(kp.OpTensorSyncDevice(params)) + .record(kp.OpAlgoDispatch(algo)) # Binds default push consts provided + .eval() # evaluates the two recorded ops + .record(kp.OpAlgoDispatch(algo, push_consts_b)) # Overrides push consts + .eval()) # evaluates only the last recorded op + + # 5. Sync results from the GPU asynchronously + sq = mgr.sequence() + sq.eval_async(kp.OpTensorSyncLocal(params)) + + # ... 
Do other work asynchronously whilst GPU finishes + + sq.eval_await() + + # Prints the first output which is: { 4, 8, 12 } + print(tensor_out_a) + # Prints the first output which is: { 10, 10, 10 } + print(tensor_out_b) + +if __name__ == "__main__": + + # Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header + # files). This shader shows some of the main components including constants, buffers, etc + shader = """ + #version 450 + + layout (local_size_x = 1) in; + + // The input tensors bind index is relative to index in parameter passed + layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; }; + layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; }; + layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; }; + layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; }; + + // Kompute supports push constants updated on dispatch + layout(push_constant) uniform PushConstants { + float val; + } push_const; + + // Kompute also supports spec constants on initalization + layout(constant_id = 0) const float const_one = 0; + + void main() { + uint index = gl_GlobalInvocationID.x; + out_a[index] += uint( in_a[index] * in_b[index] ); + out_b[index] += uint( const_one * push_const.val ); + } + """ + + kompute(shader) + +``` + +### Interactive Notebooks & Hands on Videos + +You are able to try out the interactive Colab Notebooks which allow you to use a free GPU. The available examples are the Python and C++ examples below: + + + + + + + + + + + + + + + + +
+* Try the interactive C++ Colab from Blog Post
+* Try the interactive Python Colab from Blog Post
+
+<!-- Colab notebook preview images -->
+You can also check out the two following talks presented at the FOSDEM 2021 conference.
+
+Both videos have timestamps so you can skip to the sections most relevant to you; the intro and motivations are almost the same in both, so you can jump straight to the more specific content.
+
+* Watch the video for C++ Enthusiasts
+* Watch the video for Python & Machine Learning Enthusiasts
+
+<!-- Video preview thumbnails -->
+## Architectural Overview
+
+The core architecture of Kompute includes the following:
+* [Kompute Manager](https://kompute.cc/overview/reference.html#manager) - Base orchestrator which creates and manages device and child components
+* [Kompute Sequence](https://kompute.cc/overview/reference.html#sequence) - Container of operations that can be sent to GPU as batch
+* [Kompute Operation (Base)](https://kompute.cc/overview/reference.html#algorithm) - Base class from which all operations inherit
+* [Kompute Tensor](https://kompute.cc/overview/reference.html#tensor) - Tensor structured data used in GPU operations
+* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) logic executed in the GPU
+
+To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html).
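+To make these relationships concrete, here is a minimal sketch (assuming the same `kp::Manager` / tensor / algorithm / sequence API shown in the Getting Started example above, with an already compiled SPIR-V shader passed in as `spirv` rather than shown) of how the components are typically composed:
+
+```c++
+#include <memory>
+#include <vector>
+#include <kompute/Kompute.hpp>
+
+// `spirv` is assumed to hold a compiled compute shader
+// (e.g. produced by the compileSource helper referenced in the docs).
+void run_graph(const std::vector<uint32_t>& spirv) {
+    kp::Manager mgr;                                   // Manager: creates and owns the device and child components
+
+    auto tensorIn  = mgr.tensor({ 1.0, 2.0, 3.0 });    // Tensor: structured data used in GPU operations
+    auto tensorOut = mgr.tensor({ 0.0, 0.0, 0.0 });
+
+    // Algorithm: the (shader) logic bound to the tensors it operates on
+    auto algo = mgr.algorithm({ tensorIn, tensorOut }, spirv);
+
+    mgr.sequence()                                     // Sequence: a batch of operations sent to the GPU
+        ->record<kp::OpTensorSyncDevice>({ tensorIn, tensorOut })
+        ->record<kp::OpAlgoDispatch>(algo)             // operations all inherit from the operation base class
+        ->record<kp::OpTensorSyncLocal>({ tensorOut })
+        ->eval();                                      // submit the recorded batch and wait for completion
+}
+```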
+<!-- Architecture diagrams: "Full Architecture" and "Simplified Kompute Components" (very tiny, check the full reference diagram in docs for details) -->
+## Asynchronous and Parallel Operations
+
+Kompute provides the flexibility to run operations in an asynchronous way through vk::Fences. Furthermore, Kompute enables explicit allocation of queues, which allows for parallel execution of operations across queue families.
+
+The image below provides an intuition on how Kompute Sequences can be allocated to different queues to enable parallel execution based on hardware. You can see the [hands on example](https://kompute.cc/overview/advanced-examples.html#parallel-operations), as well as the [detailed documentation page](https://kompute.cc/overview/async-parallel.html) describing how it would work using an NVIDIA 1650 as an example.
+
+![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/queue-allocation.jpg)
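+As a rough illustration of this flow (the queue family indices below are purely illustrative and device-specific, and the snippet assumes the same `kp::Manager` / `kp::Sequence` API used in the examples above), asynchronous submission and explicit queue selection look roughly like this:
+
+```c++
+#include <kompute/Kompute.hpp>
+
+void async_parallel_sketch() {
+    // Request two queues at device initialisation; the family indices (0 and 2)
+    // depend entirely on the GPU's queue family layout and are only an example.
+    kp::Manager mgr(0, { 0, 2 });
+
+    auto tensorA = mgr.tensor({ 1.0, 2.0, 3.0 });
+    auto tensorB = mgr.tensor({ 4.0, 5.0, 6.0 });
+
+    // Each sequence is created against one of the queues requested above.
+    auto sq1 = mgr.sequence(0);
+    auto sq2 = mgr.sequence(1);
+
+    // Submit without blocking; sequences on different queue families can
+    // execute in parallel on hardware that supports it.
+    sq1->evalAsync<kp::OpTensorSyncDevice>({ tensorA });
+    sq2->evalAsync<kp::OpTensorSyncDevice>({ tensorB });
+
+    // ... do other CPU work while the GPU is busy ...
+
+    // Wait on the vk::Fence associated with each submission.
+    sq1->evalAwait();
+    sq2->evalAwait();
+}
+```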
+## Mobile Enabled
+
+Kompute has been optimized to work in mobile environments. The [build system](#build-overview) enables dynamic loading of the Vulkan shared library for Android environments, together with a working [Android NDK wrapper](https://github.com/KomputeProject/kompute/tree/master/vk_ndk_wrapper_include) for the CPP headers.
+
+For a full deep dive you can read the blog post "Supercharging your Mobile Apps with On-Device GPU Accelerated Machine Learning". You can also access the end-to-end example code in the repository, which can be run using Android Studio.
+
+<!-- Android example screenshots -->
+ +## More examples + +### Simple examples + +* [Simple multiplication example](https://kompute.cc/overview/advanced-examples.html#simple-shader-example) +* [Record batch commands with a Kompute Sequence](https://kompute.cc/overview/advanced-examples.html#record-batch-commands) +* [Run Asynchronous Operations](https://kompute.cc/overview/advanced-examples.html#asynchronous-operations) +* [Run Parallel Operations Across Multiple GPU Queues](https://kompute.cc/overview/advanced-examples.html#parallel-operations) +* [Create your custom Kompute Operations](https://kompute.cc/overview/advanced-examples.html#your-custom-kompute-operation) +* [Implementing logistic regression from scratch](https://kompute.cc/overview/advanced-examples.html#logistic-regression-example) + +### End-to-end examples + +* [Machine Learning Logistic Regression Implementation](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) +* [Parallelizing GPU-intensive Workloads via Multi-Queue Operations](https://towardsdatascience.com/parallelizing-heavy-gpu-workloads-via-multi-queue-operations-50a38b15a1dc) +* [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) +* [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0) + +## Python Package + +Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality, and supports interoperability with Python objects like Lists, Numpy Arrays, etc. + +The only dependencies are Python 3.5+ and Cmake 3.4.1+. You can install Kompute from the [Python pypi package](https://pypi.org/project/kp/) using the following command. + +``` +pip install kp +``` + +You can also install from master branch using: + +``` +pip install git+git://github.com/KomputeProject/kompute.git@master +``` + +For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html). + +## C++ Build Overview + +The build system provided uses `cmake`, which allows for cross platform builds. + +The top level `Makefile` provides a set of optimized configurations for development as well as the docker image build, but you can start a build with the following command: + +``` + cmake -Bbuild +``` + +You also are able to add Kompute in your repo with `add_subdirectory` - the [Android example CMakeLists.txt file](https://github.com/KomputeProject/kompute/blob/7c8c0eeba2cdc098349fcd999102bb2cca1bf711/examples/android/android-simple/app/src/main/cpp/CMakeLists.txt#L3) shows how this would be done. + +For a more advanced overview of the build configuration check out the [Build System Deep Dive](https://kompute.cc/overview/build-system.html) documentation. + +## Kompute Development + +We appreciate PRs and Issues. If you want to contribute try checking the "Good first issue" tag, but even using Kompute and reporting issues is a great contribution! 
+ +### Contributing + +#### Dev Dependencies + +* Testing + + GTest +* Documentation + + Doxygen (with Dot) + + Sphinx + +#### Development + +* Follows the Mozilla C++ Style Guide https://www-archive.mozilla.org/hacking/mozilla-style-guide.html + + Uses a post-commit hook to run the linter; you can set it up so it runs the linter before each commit + + All dependencies are defined in vcpkg.json +* Uses CMake as the build system, and provides a top-level Makefile with the recommended commands +* Uses xxd (or the xxd.exe Windows 64-bit port) to convert shader SPIR-V to header files +* Uses Doxygen and Sphinx for documentation and autodocs +* Uses vcpkg for finding the dependencies; it's the recommended setup to retrieve the libraries + +If you want to run with debug layers, you can add them with the `KOMPUTE_ENV_DEBUG_LAYERS` parameter as: + +``` +export KOMPUTE_ENV_DEBUG_LAYERS="VK_LAYER_LUNARG_api_dump" +``` + +##### Updating documentation + +To update the documentation you will need to: +* Run the gendoxygen target in the build system +* Run the gensphynx target in the build system +* Push to GitHub Pages with `make push_docs_to_ghpages` + +##### Running tests + +Running the unit tests has been significantly simplified for contributors. + +The tests run on the CPU and can be triggered using the act command line interface (https://github.com/nektos/act). Once you install the command line tool (and start the Docker daemon) you just have to type: + +``` +$ act + +[Python Tests/python-tests] 🚀 Start image=axsauze/kompute-builder:0.2 +[C++ Tests/cpp-tests ] 🚀 Start image=axsauze/kompute-builder:0.2 +[C++ Tests/cpp-tests ] 🐳 docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[] +[Python Tests/python-tests] 🐳 docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[] +... +``` + +The repository contains unit tests for the C++ and Python code; they can be found under the `test/` and `python/test` folders. + +The tests are currently run through the CI using GitHub Actions. It uses the images found in `docker-builders/`. + +In order to minimise hardware requirements, the tests can run without a GPU, directly on the CPU using [SwiftShader](https://github.com/google/swiftshader). + +For more information on how the CI and tests are set up, you can go to the [CI, Docker and Tests Section](https://kompute.cc/overview/ci-tests.html) in the documentation. + +## Motivations + +This project started after seeing that a lot of new and renowned ML & DL projects like PyTorch, TensorFlow, Alibaba DNN and Tencent NCNN, among others, have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support. + +The Vulkan SDK offers a great low-level interface that enables highly specialized optimizations; however, it comes at the cost of highly verbose code, typically requiring 500-2000 lines of code just to begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of the Vulkan SDK. This large amount of non-standardised boilerplate can result in limited knowledge transfer, a higher chance of unique framework implementation bugs being introduced, and so on. + +We are currently developing Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on the Vulkan SDK's GPU computing capabilities.
[This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Kompute architecture. diff --git a/kompute/cmake/bin2h.cmake b/kompute/cmake/bin2h.cmake new file mode 100644 index 000000000..21ad56cb1 --- /dev/null +++ b/kompute/cmake/bin2h.cmake @@ -0,0 +1,106 @@ +################################################################################## +# Based on: https://github.com/sivachandran/cmake-bin2h +# +# Copyright 2020 Sivachandran Paramasivam +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +################################################################################## + +include(CMakeParseArguments) + +# Function to wrap a given string into multiple lines at the given column position. +# Parameters: +# VARIABLE - The name of the CMake variable holding the string. +# AT_COLUMN - The column position at which string will be wrapped. +function(WRAP_STRING) + set(oneValueArgs VARIABLE AT_COLUMN) + cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN}) + + string(LENGTH ${${WRAP_STRING_VARIABLE}} stringLength) + math(EXPR offset "0") + + while(stringLength GREATER 0) + + if(stringLength GREATER ${WRAP_STRING_AT_COLUMN}) + math(EXPR length "${WRAP_STRING_AT_COLUMN}") + else() + math(EXPR length "${stringLength}") + endif() + + string(SUBSTRING ${${WRAP_STRING_VARIABLE}} ${offset} ${length} line) + set(lines "${lines}\n${line}") + + math(EXPR stringLength "${stringLength} - ${length}") + math(EXPR offset "${offset} + ${length}") + endwhile() + + set(${WRAP_STRING_VARIABLE} "${lines}" PARENT_SCOPE) +endfunction() + +# Function to embed contents of a file as byte array in C/C++ header file(.h). The header file +# will contain a byte array and integer variable holding the size of the array. +# Parameters +# SOURCE_FILE - The path of source file whose contents will be embedded in the header file. +# VARIABLE_NAME - The name of the variable for the byte array. The string "_SIZE" will be append +# to this name and will be used a variable name for size variable. +# HEADER_FILE - The path of header file. +# APPEND - If specified appends to the header file instead of overwriting it +# NULL_TERMINATE - If specified a null byte(zero) will be append to the byte array. This will be +# useful if the source file is a text file and we want to use the file contents +# as string. 
But the size variable holds size of the byte array without this +# null byte. +# HEADER_NAMESPACE - The namespace, where the array should be located in. +# IS_BIG_ENDIAN - If set to true, will not revers the byte order for the uint32_t to match the +# big endian system architecture +# Usage: +# bin2h(SOURCE_FILE "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG") +function(BIN2H) + set(options APPEND NULL_TERMINATE) + set(oneValueArgs SOURCE_FILE VARIABLE_NAME HEADER_FILE) + cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}" "" ${ARGN}) + + # reads source file contents as hex string + file(READ ${BIN2H_SOURCE_FILE} hexString HEX) + string(LENGTH ${hexString} hexStringLength) + + # appends null byte if asked + if(BIN2H_NULL_TERMINATE) + set(hexString "${hexString}00") + endif() + + # wraps the hex string into multiple lines at column 32(i.e. 16 bytes per line) + wrap_string(VARIABLE hexString AT_COLUMN 32) + math(EXPR arraySize "${hexStringLength} / 8") + + # adds '0x' prefix and comma suffix before and after every byte respectively + if(IS_BIG_ENDIAN) + message(STATUS "Interpreting shader in big endian...") + string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\1\\2\\3\\4, " arrayValues ${hexString}) + else() + message(STATUS "Interpreting shader in little endian...") + string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\4\\3\\2\\1, " arrayValues ${hexString}) + endif() + # removes trailing comma + string(REGEX REPLACE ", $" "" arrayValues ${arrayValues}) + + # converts the variable name into proper C identifier + string(MAKE_C_IDENTIFIER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME) + string(TOUPPER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME) + + # declares byte array and the length variables + set(namespaceStart "namespace ${HEADER_NAMESPACE} {") + set(namespaceEnd "} // namespace ${HEADER_NAMESPACE}") + set(arrayIncludes "#pragma once\n#include \n#include ") + set(arrayDefinition "const std::array ${BIN2H_VARIABLE_NAME} = { ${arrayValues} };") + + set(declarations "${arrayIncludes}\n\n${namespaceStart}\n${arrayDefinition}\n${namespaceEnd}\n\n") + if(BIN2H_APPEND) + file(APPEND ${BIN2H_HEADER_FILE} "${declarations}") + else() + file(WRITE ${BIN2H_HEADER_FILE} "${declarations}") + endif() +endfunction() \ No newline at end of file diff --git a/kompute/cmake/bin_file_to_header.cmake b/kompute/cmake/bin_file_to_header.cmake new file mode 100644 index 000000000..b47b36139 --- /dev/null +++ b/kompute/cmake/bin_file_to_header.cmake @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 3.20) + +if(${INPUT_SHADER_FILE} STREQUAL "") + message(FATAL_ERROR "No input file path provided via 'INPUT_SHADER_FILE'.") +endif() + +if(${OUTPUT_HEADER_FILE} STREQUAL "") + message(FATAL_ERROR "No output file path provided via 'OUTPUT_HEADER_FILE'.") +endif() + +if(${HEADER_NAMESPACE} STREQUAL "") + message(FATAL_ERROR "No header namespace provided via 'HEADER_NAMESPACE'.") +endif() + +include(bin2h.cmake) + +get_filename_component(BINARY_FILE_CONTENT ${INPUT_SHADER_FILE} NAME) +bin2h(SOURCE_FILE ${INPUT_SHADER_FILE} HEADER_FILE ${OUTPUT_HEADER_FILE} VARIABLE_NAME ${BINARY_FILE_CONTENT} HEADER_NAMESPACE ${HEADER_NAMESPACE}) +file(APPEND ${OUTPUT_HEADER_FILE} "\n") \ No newline at end of file diff --git a/kompute/cmake/check_vulkan_version.cmake b/kompute/cmake/check_vulkan_version.cmake new file mode 100644 index 000000000..0372d3206 --- /dev/null +++ b/kompute/cmake/check_vulkan_version.cmake @@ -0,0 
+1,139 @@ +# Current issue: Only checks the result of GPU0 +function(check_vulkan_version) + cmake_parse_arguments(VULKAN_CHECK_VERSION "" "INCLUDE_DIR" "" ${ARGN}) + message(STATUS "Ensuring the currently installed driver supports the Vulkan version requested by the Vulkan Header.") + + # Get the current Vulkan Header version (e.g. 1.2.189). + # This snippet is based on: https://gitlab.kitware.com/cmake/cmake/-/blob/v3.23.1/Modules/FindVulkan.cmake#L140-156 + if(VULKAN_CHECK_VERSION_INCLUDE_DIR) + set(VULKAN_CORE_H ${VULKAN_CHECK_VERSION_INCLUDE_DIR}/vulkan/vulkan_core.h) + if(EXISTS ${VULKAN_CORE_H}) + file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE REGEX "^#define VK_HEADER_VERSION ") + string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION "${VULKAN_HEADER_VERSION_LINE}") + file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE2 REGEX "^#define VK_HEADER_VERSION_COMPLETE ") + if(NOT ${VULKAN_HEADER_VERSION_LINE2} STREQUAL "") + string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION2 "${VULKAN_HEADER_VERSION_LINE2}") + list(LENGTH VULKAN_HEADER_VERSION2 _len) + # Versions >= 1.2.175 have an additional numbers in front of e.g. '0, 1, 2' instead of '1, 2' + if(_len EQUAL 3) + list(REMOVE_AT VULKAN_HEADER_VERSION2 0) + endif() + list(APPEND VULKAN_HEADER_VERSION2 ${VULKAN_HEADER_VERSION}) + list(JOIN VULKAN_HEADER_VERSION2 "." VULKAN_HEADER_VERSION) + else() + file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_2 REGEX "^#define VK_API_VERSION_1_2.*") + if(NOT ${VULKAN_HEADER_API_VERSION_1_2} STREQUAL "") + set(VULKAN_HEADER_VERSION "1.2.${VULKAN_HEADER_VERSION}") + else() + file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_1 REGEX "^#define VK_API_VERSION_1_1.*") + if(NOT ${VULKAN_HEADER_API_VERSION_1_1} STREQUAL "") + set(VULKAN_HEADER_VERSION "1.1.${VULKAN_HEADER_VERSION}") + else() + message(FATAL_ERROR "'${VULKAN_CORE_H}' does not contain a supported Vulkan version. Probably because its < 1.2.0.") + endif() + endif() + endif() + else() + message(FATAL_ERROR "'${VULKAN_CORE_H}' does not exist. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!") + return() + endif() + else() + message(FATAL_ERROR "Invalid Vulkan include directory given. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!") + return() + endif() + message(STATUS "Found Vulkan Header version: ${VULKAN_HEADER_VERSION}") + + # Get Vulkan version supported by driver + find_program(VULKAN_INFO_PATH NAMES vulkaninfo) + if(VULKAN_INFO_PATH STREQUAL "VULKAN_INFO_PATH-NOTFOUND") + message(FATAL_ERROR "vulkaninfo not found. The Vulkan SDK might not be installed properly. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).") + return() + endif() + + execute_process(COMMAND "vulkaninfo" + OUTPUT_VARIABLE VULKAN_INFO_OUTPUT + RESULT_VARIABLE VULKAN_INFO_RETURN) + if(NOT ${VULKAN_INFO_RETURN} EQUAL 0) + message(FATAL_ERROR "Running vulkaninfo failed with return code ${VULKAN_INFO_RETURN}. Make sure you have 'vulkan-tools' installed. Result:\n${VULKAN_INFO_OUTPUT}?") + return() + else() + message(STATUS "Running vulkaninfo was successful. 
Parsing the output...") + endif() + + # Check if running vulkaninfo was successfully + string(FIND "${VULKAN_INFO_OUTPUT}" "Vulkan Instance Version" VULKAN_INFO_SUCCESSFUL) + if(VULKAN_INFO_SUCCESSFUL LESS 0) + message(FATAL_ERROR "Running vulkaninfo failed. Make sure you have 'vulkan-tools' installed and DISPLAY is configured. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON). Result:\n${VULKAN_INFO_OUTPUT}?") + endif() + + string(REGEX MATCHALL "(GPU[0-9]+)" GPU_IDS "${VULKAN_INFO_OUTPUT}") + if(NOT GPU_IDS) + message(FATAL_ERROR "No GPU supporting Vulkan found in vulkaninfo. Does your GPU (driver) support Vulkan?") + endif() + + string(REGEX MATCHALL "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*" GPU_API_VERSIONS ${VULKAN_INFO_OUTPUT}) + if(NOT GPU_API_VERSIONS) + message(FATAL_ERROR "No valid Vulkan API version found in vulkaninfo. Does your GPU (driver) support Vulkan?") + endif() + + # Check length + # message(FATAL_ERROR "GPUS: ${GPU_IDS}") + list(LENGTH GPU_IDS GPU_IDS_LENGTH) + list(LENGTH GPU_API_VERSIONS GPU_API_VERSIONS_LENGTH) + if(NOT ${GPU_IDS_LENGTH} EQUAL ${GPU_API_VERSIONS_LENGTH}) + message(FATAL_ERROR "Found ${GPU_IDS_LENGTH} GPUs, but ${GPU_API_VERSIONS_LENGTH} API versions in vulkaninfo. We expected to find an equal amount of them.") + endif() + + # Compare versions + set(VALID_GPU "") + set(VALID_VULKAN_VERSION "") + math(EXPR ITER_LEN "${GPU_IDS_LENGTH} - 1") + foreach(INDEX RANGE ${ITER_LEN}) + list(GET GPU_IDS ${INDEX} GPU) + list(GET GPU_API_VERSIONS ${INDEX} API_VERSION) + + # Extract API version + if(${API_VERSION} MATCHES "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*") + set(VULKAN_DRIVER_VERSION ${CMAKE_MATCH_1}) + else() + message(FATAL_ERROR "API version match failed. This should not have happened...") + endif() + + message(STATUS "${GPU} supports Vulkan API version '${VULKAN_DRIVER_VERSION}'.") + + # Compare driver and header version + if(${VULKAN_DRIVER_VERSION} VERSION_LESS ${VULKAN_HEADER_VERSION}) + # Version missmatch. Let us check if the minor version is the same. + if(${VULKAN_DRIVER_VERSION} MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+") + set(VULKAN_DRIVER_MINOR_VERSION ${CMAKE_MATCH_1}) + else() + message(FATAL_ERROR "Invalid Vulkan driver version '${VULKAN_DRIVER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'") + endif() + if(${VULKAN_HEADER_VERSION} MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+") + set(VULKAN_HEADER_MINOR_VERSION ${CMAKE_MATCH_1}) + else() + message(FATAL_ERROR "Invalid Vulkan Header version '${VULKAN_HEADER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'") + endif() + + if(${VULKAN_DRIVER_MINOR_VERSION} EQUAL ${VULKAN_HEADER_MINOR_VERSION}) + message(WARNING "Your GPU driver does not support Vulkan > ${VULKAN_DRIVER_VERSION}, but you try to use Vulkan Header ${VULKAN_HEADER_VERSION}. 
At least your driver supports the same minor version (${VULKAN_DRIVER_MINOR_VERSION}), so this should be fine but keep it in mind in case you encounter any strange behavior.") + set(VALID_GPU ${GPU}) + set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION}) + break() + else() + message(STATUS "${GPU} does not support Vulkan > ${VULKAN_DRIVER_VERSION}.") + endif() + else() + set(VALID_GPU ${GPU}) + set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION}) + break() + endif() + endforeach() + + if("${VALID_GPU}" STREQUAL "") + message(FATAL_ERROR "None of your GPUs supports Vulkan Header ${VULKAN_HEADER_VERSION}. Please try updating your driver, or downgrade your Vulkan headers. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).") + else() + message("Valid GPU (${VALID_GPU}) for Vulkan header version ${VULKAN_HEADER_VERSION} found. ${VALID_GPU} supports up to Vulkan ${VALID_VULKAN_VERSION}.") + endif() + +endfunction() diff --git a/kompute/cmake/code_coverage.cmake b/kompute/cmake/code_coverage.cmake new file mode 100644 index 000000000..7fb6ce264 --- /dev/null +++ b/kompute/cmake/code_coverage.cmake @@ -0,0 +1,35 @@ +# Code coverage +set(CMAKE_BUILD_TYPE COVERAGE CACHE INTERNAL "Coverage build enabled") +message(STATUS "Enabling gcov support") + +if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + set(COVERAGE_FLAG "--coverage") +endif() + +set(CMAKE_CXX_FLAGS_COVERAGE + "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage" + CACHE STRING "Flags used by the C++ compiler during coverage builds." + FORCE) +set(CMAKE_C_FLAGS_COVERAGE + "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage" + CACHE STRING "Flags used by the C compiler during coverage builds." + FORCE) +set(CMAKE_EXE_LINKER_FLAGS_COVERAGE + "" + CACHE STRING "Flags used for linking binaries during coverage builds." + FORCE) +set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE + "" + CACHE STRING "Flags used by the shared libraries linker during coverage builds." + FORCE) + +set(CODECOV_DIR ${CMAKE_CURRENT_BINARY_DIR}/codecov/) +set(CODECOV_DIR_LCOV ${CODECOV_DIR}lcov/) +set(CODECOV_FILENAME_LCOV_INFO lcov.info) +set(CODECOV_FILENAME_LCOV_INFO_FULL lcov_full.info) +set(CODECOV_DIR_HTML ${CODECOV_DIR}html/) + +mark_as_advanced(CMAKE_CXX_FLAGS_COVERAGE + CMAKE_C_FLAGS_COVERAGE + CMAKE_EXE_LINKER_FLAGS_COVERAGE + CMAKE_SHARED_LINKER_FLAGS_COVERAGE) diff --git a/kompute/cmake/deprecation_warnings.cmake b/kompute/cmake/deprecation_warnings.cmake new file mode 100644 index 000000000..1ed1f4555 --- /dev/null +++ b/kompute/cmake/deprecation_warnings.cmake @@ -0,0 +1,15 @@ +if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD) + message(FATAL_ERROR "'KOMPUTE_OPT_REPO_SUBMODULE_BUILD' got replaced by 'KOMPUTE_OPT_USE_BUILT_IN_SPDLOG', 'KOMPUTE_OPT_USE_BUILT_IN_FMT', 'KOMPUTE_OPT_USE_BUILT_IN_GOOGLE_TEST', 'KOMPUTE_OPT_USE_BUILT_IN_PYBIND11' and 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER'. Please use them instead.") +endif() + +if(KOMPUTE_OPT_BUILD_AS_SHARED_LIB) + message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_AS_SHARED_LIB' is deprecated and should not be used. Instead use the default 'BUILD_SHARED_LIBS' CMake switch.") +endif() + +if(KOMPUTE_OPT_BUILD_SINGLE_HEADER) + message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_SINGLE_HEADER' is deprecated and should not be used. 
The single header will now always be build and can be included via '#include'.") +endif() + +if(KOMPUTE_OPT_ENABLE_SPDLOG) + message(FATAL_ERROR "'KOMPUTE_OPT_ENABLE_SPDLOG' is deprecated and should not be used. It got replaced by 'KOMPUTE_OPT_LOG_LEVEL'. This option can be set to a variety of log levels (e.g. 'Off', 'Trace', 'Debug', 'Default', ...).") +endif() \ No newline at end of file diff --git a/kompute/cmake/komputeConfig.cmake.in b/kompute/cmake/komputeConfig.cmake.in new file mode 100644 index 000000000..87e8a99e2 --- /dev/null +++ b/kompute/cmake/komputeConfig.cmake.in @@ -0,0 +1,8 @@ +include(CMakeFindDependencyMacro) +@PACKAGE_INIT@ + +find_dependency(VULKAN REQUIRED) + +include(${CMAKE_CURRENT_LIST_DIR}/komputeTargets.cmake) + +check_required_components(kompute) \ No newline at end of file diff --git a/kompute/cmake/vulkan_shader_compiler.cmake b/kompute/cmake/vulkan_shader_compiler.cmake new file mode 100644 index 000000000..acc27b57c --- /dev/null +++ b/kompute/cmake/vulkan_shader_compiler.cmake @@ -0,0 +1,43 @@ +function(vulkan_compile_shader) + find_program(GLS_LANG_VALIDATOR_PATH NAMES glslangValidator) + if(GLS_LANG_VALIDATOR_PATH STREQUAL "GLS_LANG_VALIDATOR_PATH-NOTFOUND") + message(FATAL_ERROR "glslangValidator not found.") + return() + endif() + + cmake_parse_arguments(SHADER_COMPILE "" "INFILE;OUTFILE;NAMESPACE;RELATIVE_PATH" "" ${ARGN}) + set(SHADER_COMPILE_INFILE_FULL "${CMAKE_CURRENT_SOURCE_DIR}/${SHADER_COMPILE_INFILE}") + set(SHADER_COMPILE_SPV_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_INFILE}.spv") + set(SHADER_COMPILE_HEADER_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_OUTFILE}") + + if(NOT SHADER_COMPILE_RELATIVE_PATH) + set(SHADER_COMPILE_RELATIVE_PATH "${PROJECT_SOURCE_DIR}/cmake") + endif() + + # .comp -> .spv + add_custom_command(OUTPUT "${SHADER_COMPILE_SPV_FILE_FULL}" + COMMAND "${GLS_LANG_VALIDATOR_PATH}" + ARGS "-V" + "${SHADER_COMPILE_INFILE_FULL}" + "-o" + "${SHADER_COMPILE_SPV_FILE_FULL}" + COMMENT "Compile vulkan compute shader from file '${SHADER_COMPILE_INFILE_FULL}' to '${SHADER_COMPILE_SPV_FILE_FULL}'." + MAIN_DEPENDENCY "${SHADER_COMPILE_INFILE_FULL}") + + # Check if big or little endian + include (TestBigEndian) + TEST_BIG_ENDIAN(IS_BIG_ENDIAN) + + # .spv -> .hpp + add_custom_command(OUTPUT "${SHADER_COMPILE_HEADER_FILE_FULL}" + COMMAND ${CMAKE_COMMAND} + ARGS "-DINPUT_SHADER_FILE=${SHADER_COMPILE_SPV_FILE_FULL}" + "-DOUTPUT_HEADER_FILE=${SHADER_COMPILE_HEADER_FILE_FULL}" + "-DHEADER_NAMESPACE=${SHADER_COMPILE_NAMESPACE}" + "-DIS_BIG_ENDIAN=${IS_BIG_ENDIAN}" + "-P" + "${SHADER_COMPILE_RELATIVE_PATH}/bin_file_to_header.cmake" + WORKING_DIRECTORY "${SHADER_COMPILE_RELATIVE_PATH}" + COMMENT "Converting compiled shader '${SHADER_COMPILE_SPV_FILE_FULL}' to header file '${SHADER_COMPILE_HEADER_FILE_FULL}'." 
+ MAIN_DEPENDENCY "${SHADER_COMPILE_SPV_FILE_FULL}") +endfunction() diff --git a/kompute/config/FindSphinx.cmake b/kompute/config/FindSphinx.cmake new file mode 100644 index 000000000..c645ccc9f --- /dev/null +++ b/kompute/config/FindSphinx.cmake @@ -0,0 +1,16 @@ +# Look for an executable called sphinx-build +find_program(SPHINX_EXECUTABLE + NAMES sphinx-build + DOC "Path to sphinx-build executable") + +if(SPHINX_EXECUTABLE STREQUAL "SPHINX_EXECUTABLE-NOTFOUND") + message(FATAL_ERROR "sphinx-build not found.") +endif() + +include(FindPackageHandleStandardArgs) + +# Handle standard arguments to find_package like REQUIRED and QUIET +find_package_handle_standard_args( + Sphinx + "Failed to find sphinx-build executable" + SPHINX_EXECUTABLE) diff --git a/kompute/external/bin/xxd.c b/kompute/external/bin/xxd.c new file mode 100644 index 000000000..60ed3f712 --- /dev/null +++ b/kompute/external/bin/xxd.c @@ -0,0 +1,819 @@ +/* +As indicated at https://lists.debian.org/debian-legal/2015/01/msg00037.html, +the author has permitted redistribution of xxd under the MIT license, as follows: + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * xxd: my hexdump facility. jw + * + * 2.10.90 changed to word output + * 3.03.93 new indent style, dumb bug inserted and fixed. + * -c option, mls + * 26.04.94 better option parser, -ps, -l, -s added. + * 1.07.94 -r badly needs - as input file. Per default autoskip over + * consecutive lines of zeroes, as unix od does. + * -a shows them too. + * -i dump as c-style #include "file.h" + * 1.11.95 if "xxd -i" knows the filename, an 'unsigned char filename_bits[]' + * array is written in correct c-syntax. + * -s improved, now defaults to absolute seek, relative requires a '+'. + * -r improved, now -r -s -0x... is supported. + * change/suppress leading '\0' bytes. + * -l n improved: stops exactly after n bytes. + * -r improved, better handling of partial lines with trailing garbage. + * -r improved, now -r -p works again! + * -r improved, less flushing, much faster now! (that was silly) + * 3.04.96 Per repeated request of a single person: autoskip defaults to off. + * 15.05.96 -v added. They want to know the version. + * -a fixed, to show last line inf file ends in all zeros. + * -u added: Print upper case hex-letters, as preferred by unix bc. + * -h added to usage message. Usage message extended. + * Now using outfile if specified even in normal mode, aehem. + * No longer mixing of ints and longs. May help doze people. + * Added binify ioctl for same reason. (Enough Doze stress for 1996!) 
+ * 16.05.96 -p improved, removed occasional superfluous linefeed. + * 20.05.96 -l 0 fixed. tried to read anyway. + * 21.05.96 -i fixed. now honours -u, and prepends __ to numeric filenames. + * compile -DWIN32 for NT or W95. George V. Reilly, * -v improved :-) + * support --gnuish-longhorn-options + * 25.05.96 MAC support added: CodeWarrior already uses ``outline'' in Types.h + * which is included by MacHeaders (Axel Kielhorn). Renamed to + * xxdline(). + * 7.06.96 -i printed 'int' instead of 'char'. *blush* + * added Bram's OS2 ifdefs... + * 18.07.96 gcc -Wall @ SunOS4 is now slient. + * Added osver for MSDOS/DJGPP/WIN32. + * 29.08.96 Added size_t to strncmp() for Amiga. + * 24.03.97 Windows NT support (Phil Hanna). Clean exit for Amiga WB (Bram) + * 02.04.97 Added -E option, to have EBCDIC translation instead of ASCII + * (azc10@yahoo.com) + * 22.05.97 added -g (group octets) option (jcook@namerica.kla.com). + * 23.09.98 nasty -p -r misfeature fixed: slightly wrong output, when -c was + * missing or wrong. + * 26.09.98 Fixed: 'xxd -i infile outfile' did not truncate outfile. + * 27.10.98 Fixed: -g option parser required blank. + * option -b added: 01000101 binary output in normal format. + * 16.05.00 Added VAXC changes by Stephen P. Wall + * 16.05.00 Improved MMS file and merge for VMS by Zoltan Arpadffy + * + * (c) 1990-1998 by Juergen Weigert (jnweiger@informatik.uni-erlangen.de) + * + * Small changes made afterwards by Bram Moolenaar et al. + * + * Distribute freely and credit me, + * make money and share with me, + * lose money and don't ask me. + * + * + */ + +/* Visual Studio 2005 has 'deprecated' many of the standard CRT functions */ +#if _MSC_VER >= 1400 +# define _CRT_SECURE_NO_DEPRECATE +# define _CRT_NONSTDC_NO_DEPRECATE +#endif + +#include +#ifdef VAXC +# include +#else +# include +#endif +#ifdef __TSC__ +# define MSDOS +#endif +#if !defined(OS2) && defined(__EMX__) +# define OS2 +#endif +#if defined(MSDOS) || defined(WIN32) || defined(OS2) || defined(__BORLANDC__) || defined(CYGWIN) +# include /* for setmode() */ +#else +# ifdef UNIX +# include +# endif +#endif +#include +#include /* for strncmp() */ +#include /* for isalnum() */ +#if __MWERKS__ && !defined(BEBOX) +# include /* for fdopen() on MAC */ +#endif + +#if defined(__BORLANDC__) && __BORLANDC__ <= 0x0410 && !defined(fileno) +/* Missing define and prototype grabbed from the BC 4.0 */ +# define fileno(f) ((f)->fd) +FILE _FAR *_Cdecl _FARFUNC fdopen(int __handle, char _FAR *__type); +#endif + + +/* This corrects the problem of missing prototypes for certain functions + * in some GNU installations (e.g. SunOS 4.1.x). + * Darren Hiebert (sparc-sun-sunos4.1.3_U1/2.7.2.2) + */ +#if defined(__GNUC__) && defined(__STDC__) +# ifndef __USE_FIXED_PROTOTYPES__ +# define __USE_FIXED_PROTOTYPES__ +# endif +#endif + +#ifndef __USE_FIXED_PROTOTYPES__ +/* + * This is historic and works only if the compiler really has no prototypes: + * + * Include prototypes for Sun OS 4.x, when using an ANSI compiler. + * FILE is defined on OS 4.x, not on 5.x (Solaris). + * if __SVR4 is defined (some Solaris versions), don't include this. 
+ */ +#if defined(sun) && defined(FILE) && !defined(__SVR4) && defined(__STDC__) +# define __P(a) a +/* excerpt from my sun_stdlib.h */ +extern int fprintf __P((FILE *, char *, ...)); +extern int fputs __P((char *, FILE *)); +extern int _flsbuf __P((unsigned char, FILE *)); +extern int _filbuf __P((FILE *)); +extern int fflush __P((FILE *)); +extern int fclose __P((FILE *)); +extern int fseek __P((FILE *, long, int)); +extern int rewind __P((FILE *)); + +extern void perror __P((char *)); +# endif +#endif + +extern long int strtol(); +extern long int ftell(); + +char version[] = "xxd V1.10 27oct98 by Juergen Weigert"; +#ifdef WIN32 +char osver[] = " (Win32)"; +#else +# ifdef DJGPP +char osver[] = " (dos 32 bit)"; +# else +# ifdef MSDOS +char osver[] = " (dos 16 bit)"; +# else +char osver[] = ""; +# endif +# endif +#endif + +#if !defined(CYGWIN) && (defined(CYGWIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__)) +# define CYGWIN +#endif +#if defined(MSDOS) || defined(WIN32) || defined(OS2) +# define BIN_READ(yes) ((yes) ? "rb" : "rt") +# define BIN_WRITE(yes) ((yes) ? "wb" : "wt") +# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT) +# define BIN_ASSIGN(fp, yes) setmode(fileno(fp), (yes) ? O_BINARY : O_TEXT) +# define PATH_SEP '\\' +#elif defined(CYGWIN) +# define BIN_READ(yes) ((yes) ? "rb" : "rt") +# define BIN_WRITE(yes) ((yes) ? "wb" : "w") +# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT) +# define BIN_ASSIGN(fp, yes) ((yes) ? (void) setmode(fileno(fp), O_BINARY) : (void) (fp)) +# define PATH_SEP '/' +#else +# ifdef VMS +# define BIN_READ(dummy) "r" +# define BIN_WRITE(dummy) "w" +# define BIN_CREAT(dummy) O_CREAT +# define BIN_ASSIGN(fp, dummy) fp +# define PATH_SEP ']' +# define FILE_SEP '.' +# else +# define BIN_READ(dummy) "r" +# define BIN_WRITE(dummy) "w" +# define BIN_CREAT(dummy) O_CREAT +# define BIN_ASSIGN(fp, dummy) fp +# define PATH_SEP '/' +# endif +#endif + +/* open has only to arguments on the Mac */ +#if __MWERKS__ +# define OPEN(name, mode, umask) open(name, mode) +#else +# define OPEN(name, mode, umask) open(name, mode, umask) +#endif + +#ifdef AMIGA +# define STRNCMP(s1, s2, l) strncmp(s1, s2, (size_t)l) +#else +# define STRNCMP(s1, s2, l) strncmp(s1, s2, l) +#endif + +#ifndef __P +# if defined(__STDC__) || defined(MSDOS) || defined(WIN32) || defined(OS2) \ + || defined(__BORLANDC__) +# define __P(a) a +# else +# define __P(a) () +# endif +#endif + +/* Let's collect some prototypes */ +/* CodeWarrior is really picky about missing prototypes */ +static void exit_with_usage __P((char *)); +static int huntype __P((FILE *, FILE *, FILE *, char *, int, int, long)); +static void xxdline __P((FILE *, char *, int)); + +#define TRY_SEEK /* attempt to use lseek, or skip forward by reading */ +#define COLS 256 /* change here, if you ever need more columns */ +#define LLEN (11 + (9*COLS-1)/1 + COLS + 2) + +char hexxa[] = "0123456789abcdef0123456789ABCDEF", *hexx = hexxa; + +/* the different hextypes known by this program: */ +#define HEX_NORMAL 0 +#define HEX_POSTSCRIPT 1 +#define HEX_CINCLUDE 2 +#define HEX_BITS 3 /* not hex a dump, but bits: 01111001 */ + +static void +exit_with_usage(pname) +char *pname; +{ + fprintf(stderr, "Usage:\n %s [options] [infile [outfile]]\n", pname); + fprintf(stderr, " or\n %s -r [-s [-]offset] [-c cols] [-ps] [infile [outfile]]\n", pname); + fprintf(stderr, "Options:\n"); + fprintf(stderr, " -a toggle autoskip: A single '*' replaces nul-lines. 
Default off.\n"); + fprintf(stderr, " -b binary digit dump (incompatible with -p,-i,-r). Default hex.\n"); + fprintf(stderr, " -c cols format octets per line. Default 16 (-i: 12, -ps: 30).\n"); + fprintf(stderr, " -E show characters in EBCDIC. Default ASCII.\n"); + fprintf(stderr, " -g number of octets per group in normal output. Default 2.\n"); + fprintf(stderr, " -h print this summary.\n"); + fprintf(stderr, " -i output in C include file style.\n"); + fprintf(stderr, " -l len stop after octets.\n"); + fprintf(stderr, " -ps output in postscript plain hexdump style.\n"); + fprintf(stderr, " -r reverse operation: convert (or patch) hexdump into binary.\n"); + fprintf(stderr, " -r -s off revert with added to file positions found in hexdump.\n"); + fprintf(stderr, " -s %sseek start at bytes abs. %sinfile offset.\n", +#ifdef TRY_SEEK + "[+][-]", "(or +: rel.) "); +#else + "", ""); +#endif + fprintf(stderr, " -u use upper case hex letters.\n"); + fprintf(stderr, " -v show version: \"%s%s\".\n", version, osver); + exit(1); +} + +/* + * Max. cols binary characters are decoded from the input stream per line. + * Two adjacent garbage characters after evaluated data delimit valid data. + * Everything up to the next newline is discarded. + * + * The name is historic and came from 'undo type opt h'. + */ +static int +huntype(fpi, fpo, fperr, pname, cols, hextype, base_off) +FILE *fpi, *fpo, *fperr; +char *pname; +int cols, hextype; +long base_off; +{ + int c, ign_garb = 1, n1 = -1, n2 = 0, n3, p = cols; + long have_off = 0, want_off = 0; + + rewind(fpi); + + while ((c = getc(fpi)) != EOF) + { + if (c == '\r') /* Doze style input file? */ + continue; + +#if 0 /* this doesn't work when there is normal text after the hex codes in + the last line that looks like hex */ + if (c == ' ' || c == '\n' || c == '\t') /* allow multiple spaces */ + continue; +#endif + + n3 = n2; + n2 = n1; + + if (c >= '0' && c <= '9') + n1 = c - '0'; + else if (c >= 'a' && c <= 'f') + n1 = c - 'a' + 10; + else if (c >= 'A' && c <= 'F') + n1 = c - 'A' + 10; + else + { + n1 = -1; + if (ign_garb) + continue; + } + + ign_garb = 0; + + if (p >= cols) + { + if (!hextype) + { + if (n1 < 0) + { + p = 0; + continue; + } + want_off = (want_off << 4) | n1; + continue; + } + else + p = 0; + } + + if (base_off + want_off != have_off) + { + fflush(fpo); +#ifdef TRY_SEEK + c = fseek(fpo, base_off + want_off - have_off, 1); + if (c >= 0) + have_off = base_off + want_off; +#endif + if (base_off + want_off < have_off) + { + fprintf(fperr, "%s: sorry, cannot seek backwards.\n", pname); + return 5; + } + for (; have_off < base_off + want_off; have_off++) + putc(0, fpo); + } + + if (n2 >= 0 && n1 >= 0) + { + putc((n2 << 4) | n1, fpo); + have_off++; + want_off++; + n1 = -1; + if ((++p >= cols) && !hextype) + { + /* skip rest of line as garbage */ + want_off = 0; + while ((c = getc(fpi)) != '\n' && c != EOF) + ; + ign_garb = 1; + } + } + else if (n1 < 0 && n2 < 0 && n3 < 0) + { + /* already stumbled into garbage, skip line, wait and see */ + if (!hextype) + want_off = 0; + while ((c = getc(fpi)) != '\n' && c != EOF) + ; + ign_garb = 1; + } + } + fflush(fpo); +#ifdef TRY_SEEK + fseek(fpo, 0L, 2); +#endif + fclose(fpo); + fclose(fpi); + return 0; +} + +/* + * Print line l. If nz is false, xxdline regards the line a line of + * zeroes. If there are three or more consecutive lines of zeroes, + * they are replaced by a single '*' character. 
+ * + * If the output ends with more than two lines of zeroes, you + * should call xxdline again with l being the last line and nz + * negative. This ensures that the last line is shown even when + * it is all zeroes. + * + * If nz is always positive, lines are never suppressed. + */ +static void +xxdline(fp, l, nz) +FILE *fp; +char *l; +int nz; +{ + static char z[LLEN+1]; + static int zero_seen = 0; + + if (!nz && zero_seen == 1) + strcpy(z, l); + + if (nz || !zero_seen++) + { + if (nz) + { + if (nz < 0) + zero_seen--; + if (zero_seen == 2) + fputs(z, fp); + if (zero_seen > 2) + fputs("*\n", fp); + } + if (nz >= 0 || zero_seen > 0) + fputs(l, fp); + if (nz) + zero_seen = 0; + } +} + +/* This is an EBCDIC to ASCII conversion table */ +/* from a proposed BTL standard April 16, 1979 */ +static unsigned char etoa64[] = +{ + 0040,0240,0241,0242,0243,0244,0245,0246, + 0247,0250,0325,0056,0074,0050,0053,0174, + 0046,0251,0252,0253,0254,0255,0256,0257, + 0260,0261,0041,0044,0052,0051,0073,0176, + 0055,0057,0262,0263,0264,0265,0266,0267, + 0270,0271,0313,0054,0045,0137,0076,0077, + 0272,0273,0274,0275,0276,0277,0300,0301, + 0302,0140,0072,0043,0100,0047,0075,0042, + 0303,0141,0142,0143,0144,0145,0146,0147, + 0150,0151,0304,0305,0306,0307,0310,0311, + 0312,0152,0153,0154,0155,0156,0157,0160, + 0161,0162,0136,0314,0315,0316,0317,0320, + 0321,0345,0163,0164,0165,0166,0167,0170, + 0171,0172,0322,0323,0324,0133,0326,0327, + 0330,0331,0332,0333,0334,0335,0336,0337, + 0340,0341,0342,0343,0344,0135,0346,0347, + 0173,0101,0102,0103,0104,0105,0106,0107, + 0110,0111,0350,0351,0352,0353,0354,0355, + 0175,0112,0113,0114,0115,0116,0117,0120, + 0121,0122,0356,0357,0360,0361,0362,0363, + 0134,0237,0123,0124,0125,0126,0127,0130, + 0131,0132,0364,0365,0366,0367,0370,0371, + 0060,0061,0062,0063,0064,0065,0066,0067, + 0070,0071,0372,0373,0374,0375,0376,0377 +}; + +const char* extract_filename(const char* path) { + const char* filename = strrchr(path, '/'); + if (filename) { + return filename + 1; + } + return path; +} + +int +main(argc, argv) +int argc; +char *argv[]; +{ + FILE *fp, *fpo; + int c, e, p = 0, relseek = 1, negseek = 0, revert = 0; + int cols = 0, nonzero = 0, autoskip = 0, hextype = HEX_NORMAL; + int ebcdic = 0; + int octspergrp = -1; /* number of octets grouped in output */ + int grplen; /* total chars per octet group */ + long length = -1, n = 0, seekoff = 0; + char l[LLEN+1]; + char *pname, *pp; + +#ifdef AMIGA + /* This program doesn't work when started from the Workbench */ + if (argc == 0) + exit(1); +#endif + + pname = argv[0]; + for (pp = pname; *pp; ) + if (*pp++ == PATH_SEP) + pname = pp; +#ifdef FILE_SEP + for (pp = pname; *pp; pp++) + if (*pp == FILE_SEP) + { + *pp = '\0'; + break; + } +#endif + + while (argc >= 2) + { + pp = argv[1] + (!STRNCMP(argv[1], "--", 2) && argv[1][2]); + if (!STRNCMP(pp, "-a", 2)) autoskip = 1 - autoskip; + else if (!STRNCMP(pp, "-b", 2)) hextype = HEX_BITS; + else if (!STRNCMP(pp, "-u", 2)) hexx = hexxa + 16; + else if (!STRNCMP(pp, "-p", 2)) hextype = HEX_POSTSCRIPT; + else if (!STRNCMP(pp, "-i", 2)) hextype = HEX_CINCLUDE; + else if (!STRNCMP(pp, "-r", 2)) revert++; + else if (!STRNCMP(pp, "-E", 2)) ebcdic++; + else if (!STRNCMP(pp, "-v", 2)) + { + fprintf(stderr, "%s%s\n", version, osver); + exit(0); + } + else if (!STRNCMP(pp, "-c", 2)) + { + if (pp[2] && STRNCMP("ols", pp + 2, 3)) + cols = (int)strtol(pp + 2, NULL, 0); + else + { + if (!argv[2]) + exit_with_usage(pname); + cols = (int)strtol(argv[2], NULL, 0); + argv++; + argc--; + } + } + else if 
(!STRNCMP(pp, "-g", 2)) + { + if (pp[2] && STRNCMP("roupsize", pp + 2, 8)) + octspergrp = (int)strtol(pp + 2, NULL, 0); + else + { + if (!argv[2]) + exit_with_usage(pname); + octspergrp = (int)strtol(argv[2], NULL, 0); + argv++; + argc--; + } + } + else if (!STRNCMP(pp, "-s", 2)) + { + relseek = 0; + negseek = 0; + if (pp[2] && STRNCMP("kip", pp+2, 3) && STRNCMP("eek", pp+2, 3)) + { +#ifdef TRY_SEEK + if (pp[2] == '+') + relseek++; + if (pp[2+relseek] == '-') + negseek++; +#endif + seekoff = strtol(pp + 2+relseek+negseek, (char **)NULL, 0); + } + else + { + if (!argv[2]) + exit_with_usage(pname); +#ifdef TRY_SEEK + if (argv[2][0] == '+') + relseek++; + if (argv[2][relseek] == '-') + negseek++; +#endif + seekoff = strtol(argv[2] + relseek+negseek, (char **)NULL, 0); + argv++; + argc--; + } + } + else if (!STRNCMP(pp, "-l", 2)) + { + if (pp[2] && STRNCMP("en", pp + 2, 2)) + length = strtol(pp + 2, (char **)NULL, 0); + else + { + if (!argv[2]) + exit_with_usage(pname); + length = strtol(argv[2], (char **)NULL, 0); + argv++; + argc--; + } + } + else if (!strcmp(pp, "--")) /* end of options */ + { + argv++; + argc--; + break; + } + else if (pp[0] == '-' && pp[1]) /* unknown option */ + exit_with_usage(pname); + else + break; /* not an option */ + + argv++; /* advance to next argument */ + argc--; + } + + if (!cols) + switch (hextype) + { + case HEX_POSTSCRIPT: cols = 30; break; + case HEX_CINCLUDE: cols = 12; break; + case HEX_BITS: cols = 6; break; + case HEX_NORMAL: + default: cols = 16; break; + } + + if (octspergrp < 0) + switch (hextype) + { + case HEX_BITS: octspergrp = 1; break; + case HEX_NORMAL: octspergrp = 2; break; + case HEX_POSTSCRIPT: + case HEX_CINCLUDE: + default: octspergrp = 0; break; + } + + if (cols < 1 || ((hextype == HEX_NORMAL || hextype == HEX_BITS) + && (cols > COLS))) + { + fprintf(stderr, "%s: invalid number of columns (max. %d).\n", pname, COLS); + exit(1); + } + + if (octspergrp < 1) + octspergrp = cols; + + if (argc > 3) + exit_with_usage(pname); + + if (argc == 1 || (argv[1][0] == '-' && !argv[1][1])) + BIN_ASSIGN(fp = stdin, !revert); + else + { + if ((fp = fopen(argv[1], BIN_READ(!revert))) == NULL) + { + fprintf(stderr,"%s: ", pname); + perror(argv[1]); + return 2; + } + } + + if (argc < 3 || (argv[2][0] == '-' && !argv[2][1])) + BIN_ASSIGN(fpo = stdout, revert); + else + { + int fd; + int mode = revert ? O_WRONLY : (O_TRUNC|O_WRONLY); + + if (((fd = OPEN(argv[2], mode | BIN_CREAT(revert), 0666)) < 0) || + (fpo = fdopen(fd, BIN_WRITE(revert))) == NULL) + { + fprintf(stderr, "%s: ", pname); + perror(argv[2]); + return 3; + } + rewind(fpo); + } + + if (revert) + { + if (hextype && (hextype != HEX_POSTSCRIPT)) + { + fprintf(stderr, "%s: sorry, cannot revert this type of hexdump\n", pname); + return -1; + } + return huntype(fp, fpo, stderr, pname, cols, hextype, + negseek ? -seekoff : seekoff); + } + + if (seekoff || negseek || !relseek) + { +#ifdef TRY_SEEK + if (relseek) + e = fseek(fp, negseek ? -seekoff : seekoff, 1); + else + e = fseek(fp, negseek ? -seekoff : seekoff, negseek ? 2 : 0); + if (e < 0 && negseek) + { + fprintf(stderr, "%s: sorry cannot seek.\n", pname); + return 4; + } + if (e >= 0) + seekoff = ftell(fp); + else +#endif + { + long s = seekoff; + + while (s--) + (void)getc(fp); + } + } + + if (hextype == HEX_CINCLUDE) + { + const char* filename = extract_filename(argv[1]); + + if (fp != stdin) + { + fprintf(fpo, "unsigned char %s", isdigit((int)filename[0]) ? "__" : ""); + for (e = 0; (c = filename[e]) != 0; e++) + putc(isalnum(c) ? 
c : '_', fpo); + fputs("[] = {\n", fpo); + } + + p = 0; + while ((length < 0 || p < length) && (c = getc(fp)) != EOF) + { + fprintf(fpo, (hexx == hexxa) ? "%s0x%02x" : "%s0X%02X", + (p % cols) ? ", " : ",\n "+2*!p, c); + p++; + } + + if (p) + fputs("\n};\n"+3*(fp == stdin), fpo); + + if (fp != stdin) + { + fprintf(fpo, "unsigned int %s", isdigit((int)filename[0]) ? "__" : ""); + for (e = 0; (c = filename[e]) != 0; e++) + putc(isalnum(c) ? c : '_', fpo); + fprintf(fpo, "_len = %d;\n", p); + } + + fclose(fp); + fclose(fpo); + return 0; + } + + if (hextype == HEX_POSTSCRIPT) + { + p = cols; + while ((length < 0 || n < length) && (e = getc(fp)) != EOF) + { + putchar(hexx[(e >> 4) & 0xf]); + putchar(hexx[(e ) & 0xf]); + n++; + if (!--p) + { + putchar('\n'); + p = cols; + } + } + if (p < cols) + putchar('\n'); + fclose(fp); + fclose(fpo); + return 0; + } + + /* hextype: HEX_NORMAL or HEX_BITS */ + + if (hextype == HEX_NORMAL) + grplen = octspergrp + octspergrp + 1; /* chars per octet group */ + else /* hextype == HEX_BITS */ + grplen = 8 * octspergrp + 1; + + while ((length < 0 || n < length) && (e = getc(fp)) != EOF) + { + if (p == 0) + { + sprintf(l, "%07lx: ", n + seekoff); + for (c = 9; c < LLEN; l[c++] = ' '); + } + if (hextype == HEX_NORMAL) + { + l[c = (9 + (grplen * p) / octspergrp)] = hexx[(e >> 4) & 0xf]; + l[++c] = hexx[ e & 0xf]; + } + else /* hextype == HEX_BITS */ + { + int i; + + c = (9 + (grplen * p) / octspergrp) - 1; + for (i = 7; i >= 0; i--) + l[++c] = (e & (1 << i)) ? '1' : '0'; + } + if (ebcdic) + e = (e < 64) ? '.' : etoa64[e-64]; + /* When changing this update definition of LLEN above. */ + l[11 + (grplen * cols - 1)/octspergrp + p] = +#ifdef __MVS__ + (e >= 64) +#else + (e > 31 && e < 127) +#endif + ? e : '.'; + if (e) + nonzero++; + n++; + if (++p == cols) + { + l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0'; + xxdline(fpo, l, autoskip ? nonzero : 1); + nonzero = 0; + p = 0; + } + } + if (p) + { + l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0'; + xxdline(fpo, l, 1); + } + else if (autoskip) + xxdline(fpo, l, -1); /* last chance to flush out suppressed lines */ + + fclose(fp); + fclose(fpo); + return 0; +} diff --git a/kompute/kompute-config.cmake b/kompute/kompute-config.cmake new file mode 100644 index 000000000..10425252c --- /dev/null +++ b/kompute/kompute-config.cmake @@ -0,0 +1,28 @@ +# General purpose GPU compute framework built on Vulkan to +# support 1000s of cross vendor graphics cards +# (AMD, Qualcomm, NVIDIA & friends). Blazing fast, mobile-enabled, +# asynchronous and optimized for advanced GPU data processing use cases. +# Backed by the Linux Foundation. +# +# Finding this module will define the following variables: +# KOMPUTE_FOUND - True if the core library has been found +# KOMPUTE_LIBRARIES - Path to the core library archive +# KOMPUTE_INCLUDE_DIRS - Path to the include directories. Gives access +# to kompute.h, as a single include which must be included in every +# file that uses this interface. Else it also points to the +# directory for individual includes. 
+ +find_path(KOMPUTE_INCLUDE_DIR + NAMES kompute.h) + +find_library(KOMPUTE_LIBRARY + NAMES kompute + HINTS ${KOMPUTE_LIBRARY_ROOT}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(KOMPUTE REQUIRED_VARS KOMPUTE_LIBRARY KOMPUTE_INCLUDE_DIR) + +if(KOMPUTE_FOUND) + set(KOMPUTE_LIBRARIES ${KOMPUTE_LIBRARY}) + set(KOMPUTE_INCLUDE_DIRS ${KOMPUTE_INCLUDE_DIR}) +endif() diff --git a/kompute/op_add.comp b/kompute/op_add.comp new file mode 100644 index 000000000..7e4e43d75 --- /dev/null +++ b/kompute/op_add.comp @@ -0,0 +1,145 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized 
scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inBOff; + uint outOff; + uint row; +} pcs; + +void main() { + const uint i = gl_WorkGroupID.x; + + out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i) + pcs.inBOff]; +} \ No newline at end of file diff --git a/kompute/op_addrow.comp b/kompute/op_addrow.comp new file mode 100644 index 000000000..492f672e5 --- /dev/null +++ b/kompute/op_addrow.comp @@ -0,0 +1,145 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + 
uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inBOff; + uint outOff; + uint row; +} pcs; + +void main() { + const uint i = gl_WorkGroupID.x; + + out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff]; +} \ No newline at end of file diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp new file mode 100644 index 000000000..40d756ae5 --- /dev/null +++ b/kompute/op_cpy_f16_f16.comp @@ -0,0 +1,176 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +#define nth 32 +#define IN_TYPE float16_t +#define IN_TYPE_SIZE 2 +#define OUT_TYPE float16_t +#define OUT_TYPE_SIZE 2 + +layout(local_size_x = nth) in; + +layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; +layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; + +layout (push_constant) uniform parameter { + uint inOff; + uint outOff; + int ne00; + int ne01; + int ne02; + uint nb00; + uint 
nb01; + uint nb02; + uint nb03; + int ne0; + int ne1; + int ne2; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +void main() { + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; + + const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; + + const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); + const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); + const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; + const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); + + const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ + out_[dst_data+i00] = OUT_TYPE(in_[src]); + } +} diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp new file mode 100644 index 000000000..309c48aed --- /dev/null +++ b/kompute/op_cpy_f16_f32.comp @@ -0,0 +1,176 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block 
scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +#define nth 32 +#define IN_TYPE float16_t +#define IN_TYPE_SIZE 2 +#define OUT_TYPE float +#define OUT_TYPE_SIZE 4 + +layout(local_size_x = nth) in; + +layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; +layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; + +layout (push_constant) uniform parameter { + uint inOff; + uint outOff; + int ne00; + int ne01; + int ne02; + uint nb00; + uint nb01; + uint nb02; + uint nb03; + int ne0; + int ne1; + int ne2; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +void main() { + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; + + const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; + + const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); + const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); + const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; + const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); + + const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ + out_[dst_data+i00] = OUT_TYPE(in_[src]); + } +} diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp new file mode 100644 index 000000000..fb0e00d67 --- /dev/null +++ b/kompute/op_cpy_f32_f16.comp @@ -0,0 +1,176 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +#define nth 32 +#define IN_TYPE float +#define IN_TYPE_SIZE 4 +#define OUT_TYPE float16_t +#define OUT_TYPE_SIZE 2 + +layout(local_size_x = nth) in; + +layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; +layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; + +layout (push_constant) uniform parameter { + uint inOff; + uint outOff; + int ne00; + int ne01; + int ne02; + uint nb00; + uint 
nb01; + uint nb02; + uint nb03; + int ne0; + int ne1; + int ne2; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +void main() { + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; + + const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; + + const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); + const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); + const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; + const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); + + const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ + out_[dst_data+i00] = OUT_TYPE(in_[src]); + } +} diff --git a/kompute/op_cpy_f32_f32.comp b/kompute/op_cpy_f32_f32.comp new file mode 100644 index 000000000..f43480b8d --- /dev/null +++ b/kompute/op_cpy_f32_f32.comp @@ -0,0 +1,168 @@ +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 
64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +#define nth 32 +#define IN_TYPE float +#define IN_TYPE_SIZE 4 +#define OUT_TYPE float +#define OUT_TYPE_SIZE 4 + +layout(local_size_x = nth) in; + +layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; +layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; + +layout (push_constant) uniform parameter { + uint inOff; + uint outOff; + int ne00; + int ne01; + int ne02; + uint nb00; + uint nb01; + uint nb02; + uint nb03; + int ne0; + int ne1; + int ne2; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +void main() { + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; + + const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; + + const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); + const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); + const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; + const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); + + const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ + out_[dst_data+i00] = OUT_TYPE(in_[src]); + } +} diff --git a/kompute/op_diagmask.comp b/kompute/op_diagmask.comp new file mode 100644 index 000000000..18b0192d7 --- /dev/null +++ b/kompute/op_diagmask.comp @@ -0,0 +1,153 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + uint n_past; + int ne00; + int ne01; +} pcs; + +void main() { + const uint i02 = gl_WorkGroupID.z; + const uint i01 = gl_WorkGroupID.y; + const uint i00 = 
gl_WorkGroupID.x; + + const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00; + + if (i00 > pcs.n_past + i01) { + out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000); + } else { + out_[index + pcs.outOff] = in_[index + pcs.inOff]; + } +} diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp new file mode 100644 index 000000000..8079b8ef2 --- /dev/null +++ b/kompute/op_gelu.comp @@ -0,0 +1,142 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t 
qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; +} pcs; + +void main() { + const uint i = gl_WorkGroupID.x; + const float x = in_[i + pcs.inOff]; + + out_[i + pcs.outOff] = 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x))); +} diff --git a/kompute/op_getrows_f16.comp b/kompute/op_getrows_f16.comp new file mode 100644 index 000000000..e0f5bb16e --- /dev/null +++ b/kompute/op_getrows_f16.comp @@ -0,0 +1,150 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins 
+ uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { int inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int nb01; + int nb1; +} pcs; + +void main() { + const uint i = gl_WorkGroupID.x; + const int r = inB[i + pcs.inBOff]; + + for (int j = 0; j < pcs.ne00; j++) { + out_[i*pcs.nb1 + j + pcs.outOff] = inA[r*pcs.nb01/2+j + pcs.inAOff]; + } +} diff --git a/kompute/op_getrows_q4_0.comp b/kompute/op_getrows_q4_0.comp new file mode 100644 index 000000000..cddba929b --- /dev/null +++ b/kompute/op_getrows_q4_0.comp @@ -0,0 +1,179 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { int inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int nb01; + int nb1; +} pcs; + +#define UNALIGNED_INPUT inA + +block_q4_0 
get_unaligned_block_q4_0(uint index) { + block_q4_0 fres; + fres.d = u8BufToFloat16(UNALIGNED_INPUT, index); + [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) { + fres.qs[it] = UNALIGNED_INPUT[index+2+it]; + } + return fres; +} + +void dequantize_row_q4_0(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { + const uint qk = QK4_0; + + const uint nb = k / qk; + + for (uint i = 0; i < nb; i++) { + const block_q4_0 block = get_unaligned_block_q4_0(x + i*sizeof_block_q4_0); + + const float16_t d = block.d; + + for (uint j = 0; j < qk/2; ++j) { + const int x0 = (block.qs[j] & 0x0F) - 8; + const int x1 = (block.qs[j] >> 4) - 8; + + out_[y+i*qk + j + 0 ] = float(x0)*d; + out_[y+i*qk + j + qk/2] = float(x1)*d; + } + } +} + +void main() { + const uint i = gl_WorkGroupID.x; + const int r = inB[i + pcs.inBOff]; + + dequantize_row_q4_0(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00); +} diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp new file mode 100644 index 000000000..151848a9d --- /dev/null +++ b/kompute/op_getrows_q4_1.comp @@ -0,0 +1,181 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} 
block_q4_K;
+#else
+struct block_q4_K {
+    float16_t d;                  // super-block scale for quantized scales
+    float16_t dmin;               // super-block scale for quantized mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];           // 4-bit quants
+};
+#endif
+
+#if QK_K == 64
+struct block_q5_K {
+    float16_t d;             // super-block scales/mins
+    int8_t scales[QK_K/16];  // 8-bit block scales
+    uint8_t qh[QK_K/8];      // quants, high bit
+    uint8_t qs[QK_K/2];      // quants, low 4 bits
+};
+#else
+struct block_q5_K {
+    float16_t d;               // super-block scale for quantized scales
+    float16_t dmin;            // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];        // quants, high bit
+    uint8_t qs[QK_K/2];        // quants, low 4 bits
+};
+// 176 bytes / block
+#endif
+
+struct block_q6_K {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t scales[QK_K/16];  // scales, quantized with 8 bits
+    float16_t d;             // super-block scale
+};
+// 210 bytes / block
+
+layout(local_size_x = 1) in;
+
+layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
+layout (binding = 1) readonly buffer tensorInB { int inB[]; };
+layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
+
+layout (push_constant) uniform parameter {
+    uint inAOff;
+    uint inBOff;
+    uint outOff;
+    int ne00;
+    int nb01;
+    int nb1;
+} pcs;
+
+#define UNALIGNED_INPUT inA
+
+block_q4_1 get_unaligned_block_q4_1(uint index) {
+    block_q4_1 fres;
+    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
+    fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
+    [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
+        fres.qs[it] = UNALIGNED_INPUT[index+4+it];
+    }
+    return fres;
+}
+
+void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
+    const uint qk = QK4_1;
+
+    const uint nb = k / qk;
+
+    for (uint i = 0; i < nb; i++) {
+        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1); // q4_1 blocks are 20 bytes (d, m, 16 quant bytes), not sizeof_block_q4_0
+
+        const float16_t d = block.d;
+        const float16_t m = block.m;
+
+        for (uint j = 0; j < qk/2; ++j) {
+            const int x0 = (block.qs[j] & 0x0F);
+            const int x1 = (block.qs[j] >> 4);
+
+            out_[y+i*qk + j + 0   ] = float(x0)*d + m;
+            out_[y+i*qk + j + qk/2] = float(x1)*d + m;
+        }
+    }
+}
+
+void main() {
+    const uint i = gl_WorkGroupID.x;
+    const int r = inB[i + pcs.inBOff];
+
+    dequantize_row_q4_1(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
+}
diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp
new file mode 100644
index 000000000..4907015d8
--- /dev/null
+++ b/kompute/op_mul.comp
@@ -0,0 +1,145 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inBOff; + uint outOff; + uint row; +} pcs; + +void main() { + const uint i = gl_WorkGroupID.x; + + 
out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i) + pcs.inBOff]; +} \ No newline at end of file diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp new file mode 100644 index 000000000..f1198b593 --- /dev/null +++ b/kompute/op_mul_mat_f16.comp @@ -0,0 +1,177 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct 
block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 64) in; + +layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + uint nb01; + uint nb02; + uint nb11; + uint nb12; + int ne0; + int ne1; +} pcs; + +shared float sum[gl_WorkGroupSize.x]; + +void main() { + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + const uint im = gl_WorkGroupID.z; + + const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA + const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB + + sum[gl_LocalInvocationID.x] = 0.0; + + for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) { + sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]); + } + + // accumulate the sum from all threads in the threadgroup + barrier(); + memoryBarrierShared(); + [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; + } + barrier(); + memoryBarrierShared(); + } + + if (gl_LocalInvocationID.x == 0) { + out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0]; + } +} diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp new file mode 100644 index 000000000..206aea7d5 --- /dev/null +++ b/kompute/op_mul_mat_q4_0.comp @@ -0,0 +1,195 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 8, local_size_y = 8) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne10; + int ne0; +} pcs; + +shared float sum[64]; + +void main() 
{
+    const uint nb = uint(pcs.ne00/QK4_0);
+
+    const uint r0 = gl_WorkGroupID.x;
+    const uint r1 = gl_WorkGroupID.y;
+
+    const uint x = r0*nb; // Based from inA without base offset
+    const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB
+
+    const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y;
+    const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y;
+
+    const uint ix = gl_LocalInvocationID.y/4;      // 0 or 1
+    const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3
+
+    const uint first = 4 * iy;
+
+    float sumf = 0.0;
+
+    for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) {
+        const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff;
+        const float d = float(u8BufToFloat16(inA, index));
+
+        const uint xl = first; // Based from bl->qs
+        const uint yl = y + i * QK4_0 + first; // Based from inB
+
+        vec2 acc = vec2(0.0, 0.0);
+
+        for (int j = 0; j < 4; ++j) {
+            const uint8_t b = inA[index+2+xl+j];
+            acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4);
+            acc.y += inB[yl+j] + inB[yl+j+16];
+        }
+
+        sumf += d * (acc.x - 8.*acc.y);
+    }
+
+    sum[ith] = sumf;
+
+    //
+    // Accumulate the sum from all threads in the threadgroup
+    //
+    barrier();
+    if (ith == 0) {
+        float sumTotal = 0.0;
+        for (uint i = 0; i < nth; ++i) {
+            sumTotal += sum[i];
+        }
+        out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal;
+    }
+}
diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp
new file mode 100644
index 000000000..8bdf810a1
--- /dev/null
+++ b/kompute/op_mul_mat_q4_1.comp
@@ -0,0 +1,218 @@
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 8, local_size_y = 8) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne10; + int ne0; +} pcs; + +shared float 
sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y]; + +#define UNALIGNED_INPUT inA + +block_q4_1 get_unaligned_block_q4_1(uint index) { + block_q4_1 fres; + fres.d = u8BufToFloat16(UNALIGNED_INPUT, index); + fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2); + [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) { + fres.qs[it] = UNALIGNED_INPUT[index+4+it]; + } + return fres; +} + +void main() { + const uint nb = uint(pcs.ne00/QK4_1); + + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + + const uint x = r0*nb; // Based from inA without base offset + const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB + + const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y; + const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y; + + const uint ix = gl_LocalInvocationID.y/4; // 0 or 1 + const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3 + + const uint first = 4 * iy; + + float sumf = 0.0; + + for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) { + //TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it: + + const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff); + + const float d = float(block.d); + const float m = float(block.m); + + const uint xl = first; // Based from bl->qs + const uint yl = y + i * QK4_1 + first; // Based from inB + + vec2 acc = vec2(0.0, 0.0); + + for (int j = 0; j < 4; ++j) { + acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m); + acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m); + } + + sumf += d * (acc.x - acc.y); + } + + sum[ith] = sumf; + + // + // Accumulate the sum from all threads in the threadgroup + // + barrier(); + memoryBarrierShared(); + if (ith%4 == 0) { + sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3]; + } + barrier(); + memoryBarrierShared(); + if (ith%16 == 0) { + sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12]; + } + barrier(); + memoryBarrierShared(); + if (ith == 0) { + for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; + out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0]; + } +} diff --git a/kompute/op_mulrow.comp b/kompute/op_mulrow.comp new file mode 100644 index 000000000..3defd0a5f --- /dev/null +++ b/kompute/op_mulrow.comp @@ -0,0 +1,145 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inAOff; + uint inBOff; + uint outOff; + uint row; +} pcs; + +void main() { + const uint i = gl_WorkGroupID.x; + + 
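+    // Annotation (not part of the original patch): each invocation produces one
+    // output element. A is indexed directly, while B acts as a row vector of
+    // length pcs.row that is broadcast across A via the modulo index below.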
out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i % pcs.row) + pcs.inBOff]; +} \ No newline at end of file diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp new file mode 100644 index 000000000..ec0a8568d --- /dev/null +++ b/kompute/op_norm.comp @@ -0,0 +1,209 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + 
uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +#define nth 256 + +layout(local_size_x = nth) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + uint ne00; + uint nb01; + float eps; +} pcs; + +shared float sum[nth]; + +void main() { + const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_ + // MEAN + // parallel sum + sum[gl_LocalInvocationID.x] = 0.0; + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + sum[gl_LocalInvocationID.x] += in_[x+i00]; + } + + // reduce + barrier(); + memoryBarrierShared(); + [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; + } + barrier(); + memoryBarrierShared(); + } + + // broadcast + if (gl_LocalInvocationID.x == 0) { + sum[0] /= float(pcs.ne00); + } + barrier(); + memoryBarrierShared(); + const float mean = sum[0]; + + // recenter + const uint y = (gl_WorkGroupID.x*pcs.ne00/4) + pcs.outOff; // Based from out_ + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + out_[y+i00] = in_[x+i00] - mean; + } + + // VARIANCE + // parallel sum + sum[gl_LocalInvocationID.x] = 0.0; + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00]; + } + + // reduce + barrier(); + memoryBarrierShared(); + [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; + } + barrier(); + memoryBarrierShared(); + } + + // broadcast + if (gl_LocalInvocationID.x == 0) { + sum[0] /= float(pcs.ne00); + } + barrier(); + memoryBarrierShared(); + const float variance = sum[0]; + + const float scale = 1.0f/sqrt(variance + pcs.eps); + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + out_[y+i00] *= scale; + } +} diff --git a/kompute/op_relu.comp b/kompute/op_relu.comp new file mode 100644 index 000000000..bc2c31f43 --- /dev/null +++ b/kompute/op_relu.comp @@ -0,0 +1,141 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; +} pcs; + +void main() { + const uint i = gl_WorkGroupID.x; + + out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]); +} diff --git a/kompute/op_rmsnorm.comp 
b/kompute/op_rmsnorm.comp new file mode 100644 index 000000000..784713c36 --- /dev/null +++ b/kompute/op_rmsnorm.comp @@ -0,0 +1,178 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized 
with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +#define nth 256 + +layout(local_size_x = nth) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + uint ne00; + uint nb01; + float eps; +} pcs; + +shared float sum[nth]; + +void main() { + const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_ + + // parallel sum + sum[gl_LocalInvocationID.x] = 0.0; + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00]; + } + + // reduce + barrier(); + memoryBarrierShared(); + [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; + } + barrier(); + memoryBarrierShared(); + } + + // broadcast + if (gl_LocalInvocationID.x == 0) { + sum[0] /= float(pcs.ne00); + } + barrier(); + memoryBarrierShared(); + + const float scale = 1.0f/sqrt(sum[0] + pcs.eps); + + const uint y = (gl_WorkGroupID.x*pcs.ne00/4) + pcs.outOff; // Based from out_ + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + out_[y+i00] = in_[x+i00] * scale; + } +} diff --git a/kompute/op_rope.comp b/kompute/op_rope.comp new file mode 100644 index 000000000..ca6bb6831 --- /dev/null +++ b/kompute/op_rope.comp @@ -0,0 +1,183 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout (binding = 0) readonly buffer tensorIn { float in_[]; }; +layout (binding = 1) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inOff; + uint outOff; + uint n_past; + int n_dims; + int mode; + float freq_base; + float freq_scale; + uint nb00; + uint nb01; + uint nb02; + uint nb03; + int ne0; + uint nb0; + uint nb1; + uint nb2; + uint 
nb3; +} pcs; + +void main() { + const uint i3 = gl_WorkGroupID.z; + const uint i2 = gl_WorkGroupID.y; + const uint i1 = gl_WorkGroupID.x; + + const bool is_neox = (pcs.mode & 2) != 0; + const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); + + const uint p = ((pcs.mode & 1) == 0 ? pcs.n_past + i2 : i2); + + float theta = pcs.freq_scale * float(p); + + if (!is_neox) { + for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) { + const float cos_theta = cos(theta); + const float sin_theta = sin(theta); + + theta *= theta_scale; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + + const float x0 = in_[src]; + const float x1 = in_[src+1]; + + out_[dst_data] = x0*cos_theta - x1*sin_theta; + out_[dst_data+1] = x0*sin_theta + x1*cos_theta; + } + } else { + // TODO: implement + } +} diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp new file mode 100644 index 000000000..f537121a4 --- /dev/null +++ b/kompute/op_scale.comp @@ -0,0 +1,142 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} 
block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + float scale; +} pcs; + +void main() { + const uint i = gl_WorkGroupID.x; + + out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; +} \ No newline at end of file diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp new file mode 100644 index 000000000..90c034ac7 --- /dev/null +++ b/kompute/op_silu.comp @@ -0,0 +1,141 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; +} pcs; +void main() { + const uint i = gl_WorkGroupID.x; + const float x = in_[i + pcs.inOff]; + + out_[i + pcs.outOff] = x / (1.0 + exp(-x)); +} diff --git 
a/kompute/op_softmax.comp b/kompute/op_softmax.comp new file mode 100644 index 000000000..ce0e71924 --- /dev/null +++ b/kompute/op_softmax.comp @@ -0,0 +1,197 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t 
scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block + +#define nth 32 + +layout(local_size_x = nth) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + int ne00; + int ne01; + int ne02; +} pcs; + +shared float buf[nth]; + +void main() { + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; + + const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00; + const uint psrc0 = extra_off + pcs.inOff; // Based from in_ + const uint pdst = extra_off + pcs.outOff; // Based from out_ + + // parallel max + buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000); + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]); + } + + // reduce + barrier(); + memoryBarrierShared(); + [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]); + } + barrier(); + memoryBarrierShared(); + } + + // broadcast + const float max_ = buf[0]; + + // parallel sum + buf[gl_LocalInvocationID.x] = 0.0; + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_); + } + + // reduce + barrier(); + memoryBarrierShared(); + [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { + if (gl_LocalInvocationID.x < i) { + buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i]; + } + barrier(); + memoryBarrierShared(); + } + + // broadcast + const float sum = buf[0]; + + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum; + } +} diff --git a/kompute/scripts/convert_shaders.py b/kompute/scripts/convert_shaders.py new file mode 100644 index 000000000..9375b6701 --- /dev/null +++ b/kompute/scripts/convert_shaders.py @@ -0,0 +1,148 @@ +""" + Script to handle conversion of compute shaders to spirv and to headers +""" +import os +import sys +import logging +import click +import subprocess + +logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler()) + +is_windows = sys.platform.startswith('win') + +CWD=os.path.dirname(os.path.abspath(__file__)) +XXD_LINUX_CMD="xxd" +XXD_WINDOWS_CMD=os.path.abspath(os.path.join(CWD, "..\\external\\bin\\", "xxd.exe")) + +SHADER_GENERATED_NOTICE = """/* + THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT + + --- + + Copyright 2020 The Institute for Ethical AI & Machine Learning + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +""" + +@click.command() +@click.option( + "--shader-path", + "-p", + envvar="KOMPUTE_SHADER_PATH", + required=True, + help="The path for the directory to build and convert shaders", +) +@click.option( + "--shader-binary", + "-s", + envvar="KOMPUTE_SHADER_BINARY", + required=True, + help="The path for the directory to build and convert shaders", +) +@click.option( + "--header-path", + "-c", + envvar="KOMPUTE_HEADER_PATH", + default="", + required=False, + help="The (optional) output file for the cpp header files", +) +@click.option( + "--verbose", + "-v", + envvar="KOMPUTE_HEADER_PATH", + default=False, + is_flag=True, + help="Enable versbosity if flag is provided", +) +def run_cli( + shader_path: str = None, + shader_binary: str = None, + header_path: bool = None, + verbose: bool = None, +): + """ + CLI function for shader generation + """ + + if verbose: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.WARNING) + + logger.debug(f"Starting script with variables: {locals()}") + + if is_windows: + logger.debug(f"Running on windows, converting input paths") + shader_path = shader_path.replace("/", "\\") + header_path = header_path.replace("/", "\\") + + shader_files = [] + for root, directory, files in os.walk(shader_path): + for file in files: + if file.endswith(".comp"): + shader_files.append(os.path.join(root, file)) + + run_cmd = lambda *args: subprocess.check_output([*args]).decode() + + logger.debug(f"Output spirv path: {shader_path}") + logger.debug(f"Converting files to spirv: {shader_files}") + + spirv_files = [] + for file in shader_files: + logger.debug(f"Converting to spirv: {file}") + spirv_file = f"{file}.spv" + run_cmd(shader_binary, "-V", file, "-o", spirv_file) + spirv_files.append(spirv_file) + + # Create cpp files if header_path provided + if header_path: + logger.debug(f"Header path provided. 
Converting bin files to hpp.") + logger.debug(f"Output header path: {shader_path}") + + # Check if xxd command options are available + if is_windows: + xxd_cmd = XXD_WINDOWS_CMD + else: + xxd_cmd = XXD_LINUX_CMD + + for file in spirv_files: + print(xxd_cmd) + header_data = str(run_cmd(xxd_cmd, "-i", file)) + # Ensuring the variable is a static const unsigned + header_data = header_data.replace("unsigned", "static const unsigned") + if is_windows: + raw_file_name = file.split("\\")[-1] + else: + raw_file_name = file.split("/")[-1] + file_name = f"shader{raw_file_name}" + header_file = file_name.replace(".comp.spv", ".hpp") + header_file_define = "SHADEROP_" + header_file.replace(".", "_").upper() + logger.debug(f"Converting to hpp: {file_name}") + with open(os.path.join(header_path, header_file), "w+", newline='\n') as fstream: + fstream.write(f"{SHADER_GENERATED_NOTICE}\n") + fstream.write(f"#ifndef {header_file_define}\n") + fstream.write(f"#define {header_file_define}\n\n") + fstream.write("namespace kp {\n") + fstream.write("namespace shader_data {\n") + fstream.write(f"{header_data}") + fstream.write("}\n") + fstream.write("}\n") + fstream.write(f"#endif // define {header_file_define}\n") + + +if __name__ == "__main__": + run_cli() diff --git a/kompute/scripts/requirements.txt b/kompute/scripts/requirements.txt new file mode 100644 index 000000000..4da042504 --- /dev/null +++ b/kompute/scripts/requirements.txt @@ -0,0 +1,11 @@ +# CLI dependencies +click==7.1.2 + +# Dev dependencies +black==19.10b0 +quom==1.2.0 +Sphinx==3.2.1 +sphinx_material==0.0.30 +breathe==4.20.0 +m2r2==0.2.5 +git+git://github.com/pybind/pybind11_mkdoc.git@master diff --git a/kompute/setup.py b/kompute/setup.py new file mode 100644 index 000000000..09faa8d1a --- /dev/null +++ b/kompute/setup.py @@ -0,0 +1,93 @@ +import os +import re +import platform +import sys +import sysconfig +import subprocess + +from setuptools import setup, Extension +from setuptools.command.build_ext import build_ext +from distutils.version import LooseVersion + +curr_dir = os.path.abspath(os.path.dirname(__file__)) +with open(os.path.join(curr_dir, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +class CMakeExtension(Extension): + def __init__(self, name, sourcedir=''): + Extension.__init__(self, name, sources=[]) + self.sourcedir = os.path.abspath(sourcedir) + + +class CMakeBuild(build_ext): + def run(self): + try: + out = subprocess.check_output(['cmake', '--version']) + except OSError: + raise RuntimeError("CMake must be installed to build the following extensions: " + + ", ".join(e.name for e in self.extensions)) + + cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1)) + if cmake_version < '3.15': + raise RuntimeError("CMake >= 3.15 is required") + + for ext in self.extensions: + self.build_extension(ext) + + def build_extension(self, ext): + extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + # required for auto-detection of auxiliary "native" libs + if not extdir.endswith(os.path.sep): + extdir += os.path.sep + + cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, + '-DKOMPUTE_OPT_BUILD_PYTHON=ON', + '-DKOMPUTE_OPT_LOG_LEVEL=Off', + '-DKOMPUTE_OPT_USE_SPDLOG=Off', + '-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' + '-DPYTHON_EXECUTABLE=' + sys.executable, + '-DPYTHON_INCLUDE_DIR=' + sysconfig.get_path('include'), + '-DPYTHON_LIBRARY=' + sysconfig.get_path('stdlib'), + ] + + cfg = 'Debug' if self.debug else 'Release' + build_args = ['--config', cfg] 
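+        # Annotation (not part of the original patch): the package version is
+        # injected into the native build through a VERSION_INFO define in CXXFLAGS,
+        # and platform-specific generator/parallelism flags are appended to
+        # cmake_args/build_args below before CMake is invoked.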
+ + env = os.environ.copy() + oldCxxFlags = env.get('CXXFLAGS', '') + env['CXXFLAGS'] = f'{oldCxxFlags} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"' + + if platform.system() == "Windows": + cmake_args += [f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}'] + if sys.maxsize > 2**32: + cmake_args += ['-A', 'x64'] + build_args += ['--', '/m'] + else: + env['CXXFLAGS'] += ' -fPIC' + cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] + build_args += ['--', '-j'] + # Optional environment variable to limit the number of parallel jobs for GitHub actions to reduce RAM usage + if 'KOMPUTE_PYTHON_NUM_PARALLEL_THREADS' in env: + build_args += env['KOMPUTE_PYTHON_NUM_PARALLEL_THREADS'] + + if not os.path.exists(self.build_temp): + os.makedirs(self.build_temp) + + subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env) + subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) + +setup( + name='kp', + version='0.8.1', + author='Alejandro Saucedo', + description='Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.', + long_description=long_description, + long_description_content_type='text/markdown', + ext_modules=[CMakeExtension('kp')], + install_requires=[ + "numpy<2.0.0" + ], + cmdclass=dict(build_ext=CMakeBuild), + zip_safe=False, + include_package_data=True, +) diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp new file mode 100644 index 000000000..9c41ec90f --- /dev/null +++ b/kompute/src/Algorithm.cpp @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#include + +#include "kompute/Algorithm.hpp" + +namespace kp { + +Algorithm::~Algorithm() +{ + KP_LOG_DEBUG("Kompute Algorithm Destructor started"); + + this->destroy(); +} + +bool +Algorithm::isInit() +{ + return this->mPipeline && this->mPipelineCache && this->mPipelineLayout && + this->mDescriptorPool && this->mDescriptorSet && + this->mDescriptorSetLayout && this->mShaderModule; +} + +void +Algorithm::destroy() +{ + // We don't have to free memory on destroy as it's freed by the + // commandBuffer destructor if (this->mPushConstantsData) { + // free(this->mPushConstantsData); + // } + // if (this->mSpecializationConstantsData) { + // free(this->mSpecializationConstantsData); + // } + + if (!this->mDevice) { + KP_LOG_WARN("Kompute Algorithm destroy function reached with null " + "Device pointer"); + return; + } + + if (this->mFreePipeline && this->mPipeline) { + KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline"); + if (!this->mPipeline) { + KP_LOG_WARN("Kompute Algorithm Error requested to destroy " + "pipeline but it is null"); + } + this->mDevice->destroy( + *this->mPipeline, + (vk::Optional)nullptr); + this->mPipeline = nullptr; + } + + if (this->mFreePipelineCache && this->mPipelineCache) { + KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline cache"); + if (!this->mPipelineCache) { + KP_LOG_WARN("Kompute Algorithm Error requested to destroy " + "pipeline cache but it is null"); + } + this->mDevice->destroy( + *this->mPipelineCache, + (vk::Optional)nullptr); + this->mPipelineCache = nullptr; + } + + if (this->mFreePipelineLayout && this->mPipelineLayout) { + KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline layout"); + if (!this->mPipelineLayout) { + KP_LOG_WARN("Kompute Algorithm Error requested to destroy " + "pipeline layout but it is null"); + } + this->mDevice->destroy( + *this->mPipelineLayout, + (vk::Optional)nullptr); + this->mPipelineLayout = nullptr; + } + + if (this->mFreeShaderModule && this->mShaderModule) { + KP_LOG_DEBUG("Kompute Algorithm Destroying shader module"); + if (!this->mShaderModule) { + KP_LOG_WARN("Kompute Algorithm Error requested to destroy shader " + "module but it is null"); + } + this->mDevice->destroy( + *this->mShaderModule, + (vk::Optional)nullptr); + this->mShaderModule = nullptr; + } + + freeParameters(); +} + +void +Algorithm::freeParameters() +{ + if (this->mFreeDescriptorSetLayout && this->mDescriptorSetLayout) { + KP_LOG_DEBUG("Kompute Algorithm Destroying Descriptor Set Layout"); + if (!this->mDescriptorSetLayout) { + KP_LOG_WARN("Kompute Algorithm Error requested to destroy " + "descriptor set layout but it is null"); + } + this->mDevice->destroy( + *this->mDescriptorSetLayout, + (vk::Optional)nullptr); + this->mDescriptorSetLayout = nullptr; + } +} + +void +Algorithm::createParameters() +{ + KP_LOG_DEBUG("Kompute Algorithm createParameters started"); + if (!*this->mDescriptorPool) { + KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool"); + return; + } + + std::vector descriptorSetBindings; + for (size_t i = 0; i < this->mTensors.size(); i++) { + descriptorSetBindings.push_back( + vk::DescriptorSetLayoutBinding(i, // Binding index + vk::DescriptorType::eStorageBuffer, + 1, // Descriptor count + vk::ShaderStageFlagBits::eCompute)); + } + + // This is the component that is fed into the pipeline + vk::DescriptorSetLayoutCreateInfo descriptorSetLayoutInfo( + vk::DescriptorSetLayoutCreateFlags(), + static_cast(descriptorSetBindings.size()), + descriptorSetBindings.data()); + + KP_LOG_DEBUG("Kompute Algorithm creating 
descriptor set layout"); + this->mDescriptorSetLayout = std::make_shared(); + vk::Result result = this->mDevice->createDescriptorSetLayout( + &descriptorSetLayoutInfo, nullptr, this->mDescriptorSetLayout.get()); + + if (result != vk::Result::eSuccess) { + KP_LOG_ERROR("Failed to create descriptor set layout. Error code: {}", vk::to_string(result)); + } else { + this->mFreeDescriptorSetLayout = true; + KP_LOG_DEBUG("Successfully allocated descriptor set layout."); + } + + vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo( + *this->mDescriptorPool, + 1, // Descriptor set layout count + this->mDescriptorSetLayout.get()); + + KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets"); + this->mDescriptorSet = std::make_shared(); + result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo, + this->mDescriptorSet.get()); + + if (result != vk::Result::eSuccess) { + KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result)); + } else { + this->mFreeDescriptorSet = true; + KP_LOG_DEBUG("Successfully allocated descriptor sets."); + } + + this->mFreeDescriptorSet = true; + + KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets"); + for (size_t i = 0; i < this->mTensors.size(); i++) { + std::vector computeWriteDescriptorSets; + + vk::DescriptorBufferInfo descriptorBufferInfo = + this->mTensors[i]->constructDescriptorBufferInfo(); + + computeWriteDescriptorSets.push_back( + vk::WriteDescriptorSet(*this->mDescriptorSet, + i, // Destination binding + 0, // Destination array element + 1, // Descriptor count + vk::DescriptorType::eStorageBuffer, + nullptr, // Descriptor image info + &descriptorBufferInfo)); + + this->mDevice->updateDescriptorSets(computeWriteDescriptorSets, + nullptr); + } + + KP_LOG_DEBUG("Kompute Algorithm successfully run init"); +} + +void +Algorithm::updateParameters() +{ + KP_LOG_DEBUG("Kompute Algorithm updateParameters started"); + if (!*this->mDescriptorPool) { + KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool"); + return; + } + + vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo( + *this->mDescriptorPool, + 1, // Descriptor set layout count + this->mDescriptorSetLayout.get()); + + KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets"); + this->mDescriptorSet = std::make_shared(); + vk::Result result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo, + this->mDescriptorSet.get()); + + if (result != vk::Result::eSuccess) { + KP_LOG_ERROR("Failed to allocate descriptor sets. 
Error code: {}", vk::to_string(result)); + } else { + this->mFreeDescriptorSet = true; + KP_LOG_DEBUG("Successfully allocated descriptor sets."); + } + + this->mFreeDescriptorSet = true; + + KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets"); + for (size_t i = 0; i < this->mTensors.size(); i++) { + std::vector computeWriteDescriptorSets; + + vk::DescriptorBufferInfo descriptorBufferInfo = + this->mTensors[i]->constructDescriptorBufferInfo(); + + computeWriteDescriptorSets.push_back( + vk::WriteDescriptorSet(*this->mDescriptorSet, + i, // Destination binding + 0, // Destination array element + 1, // Descriptor count + vk::DescriptorType::eStorageBuffer, + nullptr, // Descriptor image info + &descriptorBufferInfo)); + + this->mDevice->updateDescriptorSets(computeWriteDescriptorSets, + nullptr); + } + + KP_LOG_DEBUG("Kompute Algorithm successfully run init"); +} + +void +Algorithm::createShaderModule() +{ + KP_LOG_DEBUG("Kompute Algorithm createShaderModule started"); + + vk::ShaderModuleCreateInfo shaderModuleInfo(vk::ShaderModuleCreateFlags(), + sizeof(uint32_t) * + this->mSpirv.size(), + this->mSpirv.data()); + + KP_LOG_DEBUG("Kompute Algorithm Creating shader module. ShaderFileSize: {}", + this->mSpirv.size()); + this->mFreeShaderModule = true; + this->mShaderModule = std::make_shared(); + this->mDevice->createShaderModule( + &shaderModuleInfo, nullptr, this->mShaderModule.get()); + this->mFreeShaderModule = true; + + KP_LOG_DEBUG("Kompute Algorithm create shader module success"); +} + +void +Algorithm::createPipeline() +{ + KP_LOG_DEBUG("Kompute Algorithm calling create Pipeline"); + + vk::PipelineLayoutCreateInfo pipelineLayoutInfo( + vk::PipelineLayoutCreateFlags(), + 1, // Set layout count + this->mDescriptorSetLayout.get()); + + vk::PushConstantRange pushConstantRange; + if (this->mPushConstantsSize) { + pushConstantRange.setStageFlags(vk::ShaderStageFlagBits::eCompute); + pushConstantRange.setOffset(0); + pushConstantRange.setSize(this->mPushConstantsDataTypeMemorySize * + this->mPushConstantsSize); + + pipelineLayoutInfo.setPushConstantRangeCount(1); + pipelineLayoutInfo.setPPushConstantRanges(&pushConstantRange); + } + + this->mPipelineLayout = std::make_shared(); + this->mDevice->createPipelineLayout( + &pipelineLayoutInfo, nullptr, this->mPipelineLayout.get()); + this->mFreePipelineLayout = true; + + std::vector specializationEntries; + + for (uint32_t i = 0; i < this->mSpecializationConstantsSize; i++) { + vk::SpecializationMapEntry specializationEntry( + static_cast(i), + static_cast( + this->mSpecializationConstantsDataTypeMemorySize * i), + this->mSpecializationConstantsDataTypeMemorySize); + + specializationEntries.push_back(specializationEntry); + } + + // This passes ownership of the memory so we remove ownership from + // specialization container by using "transferDataOwnership" + vk::SpecializationInfo specializationInfo( + static_cast(specializationEntries.size()), + specializationEntries.data(), + this->mSpecializationConstantsDataTypeMemorySize * + this->mSpecializationConstantsSize, + this->mSpecializationConstantsData); + + vk::PipelineShaderStageCreateInfo shaderStage( + vk::PipelineShaderStageCreateFlags(), + vk::ShaderStageFlagBits::eCompute, + *this->mShaderModule, + "main", + &specializationInfo); + + static std::shared_ptr globalPipelineCache = std::make_shared(); + if(!*globalPipelineCache) { + vk::PipelineCacheCreateInfo pipelineCacheInfo = + vk::PipelineCacheCreateInfo(); + this->mPipelineCache = globalPipelineCache; + this->mFreePipelineCache = 
true; + this->mDevice->createPipelineCache( + &pipelineCacheInfo, nullptr, globalPipelineCache.get()); + } + + vk::ComputePipelineCreateInfo pipelineInfo(vk::PipelineCreateFlags(), + shaderStage, + *this->mPipelineLayout, + vk::Pipeline(), + 0); + +#ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE + vk::ResultValue pipelineResult = + this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo); + + if (pipelineResult.result != vk::Result::eSuccess) { + throw std::runtime_error("Failed to create pipeline result: " + + vk::to_string(pipelineResult.result)); + } + + vk::Pipeline& pipeline = pipelineResult.value; + this->mPipeline = std::make_shared(pipeline); + this->mFreePipeline = true; +#else + vk::Pipeline pipeline = + this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo) + .value; + this->mPipeline = std::make_shared(pipeline); + this->mFreePipeline = true; +#endif + + // TODO: Update to consistent + // this->mPipeline = std::make_shared(); + // this->mDevice->createComputePipelines( + // *this->mPipelineCache, 1, &pipelineInfo, nullptr, + // this->mPipeline.get()); + + KP_LOG_DEBUG("Kompute Algorithm Create Pipeline Success"); +} + +void +Algorithm::recordBindCore(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute Algorithm binding pipeline"); + + commandBuffer.bindPipeline(vk::PipelineBindPoint::eCompute, + *this->mPipeline); + + KP_LOG_DEBUG("Kompute Algorithm binding descriptor sets"); + + commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, + *this->mPipelineLayout, + 0, // First set + *this->mDescriptorSet, + nullptr // Dispatcher + ); +} + +void +Algorithm::recordBindPush(const vk::CommandBuffer& commandBuffer) +{ + if (this->mPushConstantsSize) { + KP_LOG_DEBUG("Kompute Algorithm binding push constants memory size: {}", + this->mPushConstantsSize * + this->mPushConstantsDataTypeMemorySize); + + commandBuffer.pushConstants(*this->mPipelineLayout, + vk::ShaderStageFlagBits::eCompute, + 0, + this->mPushConstantsSize * + this->mPushConstantsDataTypeMemorySize, + this->mPushConstantsData); + } +} + +void +Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute Algorithm recording dispatch"); + + commandBuffer.dispatch( + this->mWorkgroup[0], this->mWorkgroup[1], this->mWorkgroup[2]); +} + +void +Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize) +{ + + KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size"); + + // The dispatch size is set up based on either explicitly provided template + // parameters or by default it would take the shape and size of the tensors + if (workgroup[0] > 0) { + // If at least the x value is provided we use mainly the parameters + // provided + this->mWorkgroup = { workgroup[0], + workgroup[1] > 0 ? workgroup[1] : 1, + workgroup[2] > 0 ? 
workgroup[2] : 1 }; + } else { + this->mWorkgroup = { minSize, 1, 1 }; + } + + KP_LOG_INFO("Kompute OpAlgoCreate set dispatch size X: {}, Y: {}, Z: {}", + this->mWorkgroup[0], + this->mWorkgroup[1], + this->mWorkgroup[2]); +} + +const Workgroup& +Algorithm::getWorkgroup() +{ + return this->mWorkgroup; +} + +const std::vector>& +Algorithm::getTensors() +{ + return this->mTensors; +} + +void Algorithm::setTensors(const std::vector>& tensors) +{ + this->mTensors = tensors; +} + +} diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt new file mode 100644 index 000000000..f4f8440f4 --- /dev/null +++ b/kompute/src/CMakeLists.txt @@ -0,0 +1,82 @@ +# SPDX-License-Identifier: Apache-2.0 + +cmake_minimum_required(VERSION 3.20) + +if(KOMPUTE_OPT_ANDROID_BUILD) + find_library(android android) +endif() + +cmake_minimum_required(VERSION 3.20) + +add_library(kompute Algorithm.cpp + Manager.cpp + OpAlgoDispatch.cpp + OpMemoryBarrier.cpp + OpTensorCopy.cpp + OpTensorSyncDevice.cpp + OpTensorSyncLocal.cpp + OpBufferSyncDevice.cpp + OpBufferSyncLocal.cpp + Sequence.cpp + Tensor.cpp + Core.cpp) + +add_library(kompute::kompute ALIAS kompute) + +# Set version for shared libraries. +set_target_properties(kompute + PROPERTIES + VERSION ${${PROJECT_NAME}_VERSION} + SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR}) + +# Import GNU common install directory variables +include(GNUInstallDirs) + +install(TARGETS kompute + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +# Include CMake helpers for package config files +# Follow this installation guideline: https://cmake.org/cmake/help/latest/manual/cmake-packages.7.html +include(CMakePackageConfigHelpers) + +configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/komputeConfig.cmake.in + "${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute) + +install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake + ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute) + +# #################################################### +# Linking +# #################################################### +if(KOMPUTE_OPT_ANDROID_BUILD) + target_link_libraries(kompute PUBLIC vulkanAndroid + android + kp_logger + kp_shader + fmt::fmt) +else() + target_link_libraries(kompute PUBLIC Vulkan::Vulkan + kp_logger + kp_shader + fmt::fmt) +endif() + +if(KOMPUTE_OPT_BUILD_PYTHON) + include_directories(${PYTHON_INCLUDE_DIRS}) + + target_link_libraries(kompute PRIVATE pybind11::headers ${PYTHON_LIBRARIES}) +endif() + +if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER) + target_link_libraries(kompute PUBLIC Vulkan-Headers) +endif() + +# #################################################### +# Misc +# #################################################### +add_subdirectory(logger) +add_subdirectory(shaders) +add_subdirectory(include) diff --git a/kompute/src/Core.cpp b/kompute/src/Core.cpp new file mode 100644 index 000000000..60849a3ec --- /dev/null +++ b/kompute/src/Core.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
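+ *
+ * Core.cpp only provides the storage required by the Vulkan-Hpp default dynamic
+ * dispatcher on Android builds (VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE);
+ * the kp namespace is otherwise left empty in this translation unit.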
+ */ + +#include "kompute/Core.hpp" + +#if VK_USE_PLATFORM_ANDROID_KHR +#ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE +#define KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE +/** + * Ensures support for dynamic loading of Vulkan functions on Android. + * Acts as a default store for loaded functions. + * More information: + * https://github.com/KhronosGroup/Vulkan-Hpp#vulkan_hpp_default_dispatcher + **/ +VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE +#endif // !KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE +#endif // VK_USE_PLATFORM_ANDROID_KHR + +namespace kp { +} // namespace kp diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp new file mode 100644 index 000000000..07514ed9a --- /dev/null +++ b/kompute/src/Manager.cpp @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#include "kompute/Manager.hpp" +#include "fmt/format.h" +#include "kompute/logger/Logger.hpp" +#include +#include +#include +#include +#include + +namespace kp { + +#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS +static VKAPI_ATTR VkBool32 VKAPI_CALL +debugMessageCallback(VkDebugReportFlagsEXT /*flags*/, + VkDebugReportObjectTypeEXT /*objectType*/, + uint64_t /*object*/, + size_t /*location*/, + int32_t /*messageCode*/, +#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG + const char* pLayerPrefix, + const char* pMessage, +#else + const char* /*pLayerPrefix*/, + const char* /*pMessage*/, +#endif + void* /*pUserData*/) +{ + KP_LOG_DEBUG("[VALIDATION]: {} - {}", pLayerPrefix, pMessage); + return VK_FALSE; +} +#endif + +Manager::Manager() +{ + this->mManageResources = true; + +// Make sure the logger is setup +#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED + logger::setupLogger(); +#endif + this->createInstance(); +} + +void Manager::initializeDevice(uint32_t physicalDeviceIndex, + const std::vector& familyQueueIndices, + const std::vector& desiredExtensions) +{ + this->createDevice( + familyQueueIndices, physicalDeviceIndex, desiredExtensions); +} + +Manager::~Manager() +{ + KP_LOG_DEBUG("Kompute Manager Destructor started"); + this->destroy(); +} + +void +Manager::destroy() +{ + + KP_LOG_DEBUG("Kompute Manager destroy() started"); + + if (this->mDevice == nullptr) { + KP_LOG_ERROR( + "Kompute Manager destructor reached with null Device pointer"); + return; + } + + if (this->mManageResources && this->mManagedSequences.size()) { + KP_LOG_DEBUG("Kompute Manager explicitly running destructor for " + "managed sequences"); + for (const std::weak_ptr& weakSq : this->mManagedSequences) { + if (std::shared_ptr sq = weakSq.lock()) { + sq->destroy(); + } + } + this->mManagedSequences.clear(); + } + + if (this->mManageResources && this->mManagedAlgorithms.size()) { + KP_LOG_DEBUG("Kompute Manager explicitly freeing algorithms"); + for (const std::weak_ptr& weakAlgorithm : + this->mManagedAlgorithms) { + if (std::shared_ptr algorithm = weakAlgorithm.lock()) { + algorithm->destroy(); + } + } + this->mManagedAlgorithms.clear(); + } + + if (this->mManageResources && this->mManagedTensors.size()) { + KP_LOG_DEBUG("Kompute Manager explicitly freeing tensors"); + for (const std::weak_ptr& weakTensor : this->mManagedTensors) { + if 
(std::shared_ptr tensor = weakTensor.lock()) { + tensor->destroy(); + } + } + this->mManagedTensors.clear(); + } + + if (this->mFreeDevice) { + KP_LOG_INFO("Destroying device"); + this->mDevice->destroy( + (vk::Optional)nullptr); + this->mDevice = nullptr; + KP_LOG_DEBUG("Kompute Manager Destroyed Device"); + } + + if (this->mInstance == nullptr) { + KP_LOG_ERROR( + "Kompute Manager destructor reached with null Instance pointer"); + return; + } + +#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS + if (this->mDebugReportCallback) { + this->mInstance->destroyDebugReportCallbackEXT( + this->mDebugReportCallback, nullptr, this->mDebugDispatcher); + KP_LOG_DEBUG("Kompute Manager Destroyed Debug Report Callback"); + } +#endif + + if (this->mFreeInstance) { + this->mInstance->destroy( + (vk::Optional)nullptr); + this->mInstance = nullptr; + KP_LOG_DEBUG("Kompute Manager Destroyed Instance"); + } +} + +void +Manager::createInstance() +{ + + KP_LOG_DEBUG("Kompute Manager creating instance"); + + this->mFreeInstance = true; + + vk::ApplicationInfo applicationInfo; + applicationInfo.pApplicationName = "Kompute"; + applicationInfo.pEngineName = "Kompute"; + applicationInfo.apiVersion = KOMPUTE_VK_API_VERSION; + applicationInfo.engineVersion = KOMPUTE_VK_API_VERSION; + applicationInfo.applicationVersion = KOMPUTE_VK_API_VERSION; + + std::vector applicationExtensions; + +#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS + applicationExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); +#endif + + vk::InstanceCreateInfo computeInstanceCreateInfo; + computeInstanceCreateInfo.pApplicationInfo = &applicationInfo; + if (!applicationExtensions.empty()) { + computeInstanceCreateInfo.enabledExtensionCount = + (uint32_t)applicationExtensions.size(); + computeInstanceCreateInfo.ppEnabledExtensionNames = + applicationExtensions.data(); + } + +#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS + KP_LOG_DEBUG("Kompute Manager adding debug validation layers"); + // We'll identify the layers that are supported + std::vector validLayerNames; + std::vector desiredLayerNames = { + "VK_LAYER_LUNARG_assistant_layer", + "VK_LAYER_LUNARG_standard_validation", + "VK_LAYER_KHRONOS_validation", + }; + std::vector envLayerNames; + const char* envLayerNamesVal = std::getenv("KOMPUTE_ENV_DEBUG_LAYERS"); + if (envLayerNamesVal != nullptr && *envLayerNamesVal != '\0') { + KP_LOG_DEBUG("Kompute Manager adding environment layers: {}", + envLayerNamesVal); + std::istringstream iss(envLayerNamesVal); + std::istream_iterator beg(iss); + std::istream_iterator end; + envLayerNames = std::vector(beg, end); + for (const std::string& layerName : envLayerNames) { + desiredLayerNames.push_back(layerName.c_str()); + } + KP_LOG_DEBUG("Desired layers: {}", fmt::join(desiredLayerNames, ", ")); + } + + // Identify the valid layer names based on the desiredLayerNames + { + std::set uniqueLayerNames; + std::vector availableLayerProperties = + vk::enumerateInstanceLayerProperties(); + for (vk::LayerProperties layerProperties : availableLayerProperties) { + std::string layerName(layerProperties.layerName.data()); + uniqueLayerNames.insert(layerName); + } + KP_LOG_DEBUG("Available layers: {}", fmt::join(uniqueLayerNames, ", ")); + for (const char* desiredLayerName : desiredLayerNames) { + if (uniqueLayerNames.count(desiredLayerName) != 0) { + validLayerNames.push_back(desiredLayerName); + } + } + } + + if (!validLayerNames.empty()) { + KP_LOG_DEBUG( + "Kompute Manager Initializing instance with valid layers: {}", + fmt::join(validLayerNames, ", ")); + 
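+ // The layers enabled below are the intersection of the desired layer names
+ // (including any supplied via the KOMPUTE_ENV_DEBUG_LAYERS environment variable,
+ // parsed as a space-separated list) and the layers reported by
+ // vk::enumerateInstanceLayerProperties(). Illustrative invocation only, the
+ // binary name is hypothetical:
+ //   KOMPUTE_ENV_DEBUG_LAYERS="VK_LAYER_KHRONOS_validation" ./main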
computeInstanceCreateInfo.enabledLayerCount = + static_cast(validLayerNames.size()); + computeInstanceCreateInfo.ppEnabledLayerNames = validLayerNames.data(); + } else { + KP_LOG_WARN("Kompute Manager no valid layer names found from desired " + "layer names"); + } +#endif + +#if VK_USE_PLATFORM_ANDROID_KHR + vk::DynamicLoader dl; + PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr = + dl.getProcAddress("vkGetInstanceProcAddr"); + VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); +#endif // VK_USE_PLATFORM_ANDROID_KHR + + this->mInstance = std::make_shared(); + vk::createInstance( + &computeInstanceCreateInfo, nullptr, this->mInstance.get()); + +#if VK_USE_PLATFORM_ANDROID_KHR + VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance); +#endif // VK_USE_PLATFORM_ANDROID_KHR + + KP_LOG_DEBUG("Kompute Manager Instance Created"); + +#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS + KP_LOG_DEBUG("Kompute Manager adding debug callbacks"); + if (validLayerNames.size() > 0) { + vk::DebugReportFlagsEXT debugFlags = + vk::DebugReportFlagBitsEXT::eError | + vk::DebugReportFlagBitsEXT::eWarning; + vk::DebugReportCallbackCreateInfoEXT debugCreateInfo = {}; + debugCreateInfo.pfnCallback = + (PFN_vkDebugReportCallbackEXT)debugMessageCallback; + debugCreateInfo.flags = debugFlags; + + this->mDebugDispatcher.init(*this->mInstance, &vkGetInstanceProcAddr); + this->mDebugReportCallback = + this->mInstance->createDebugReportCallbackEXT( + debugCreateInfo, nullptr, this->mDebugDispatcher); + } +#endif +} + +void +Manager::clear() +{ + if (this->mManageResources) { + this->mManagedTensors.erase( + std::remove_if(begin(this->mManagedTensors), + end(this->mManagedTensors), + [](std::weak_ptr t) { return t.expired(); }), + end(this->mManagedTensors)); + this->mManagedAlgorithms.erase( + std::remove_if( + begin(this->mManagedAlgorithms), + end(this->mManagedAlgorithms), + [](std::weak_ptr t) { return t.expired(); }), + end(this->mManagedAlgorithms)); + this->mManagedSequences.erase( + std::remove_if(begin(this->mManagedSequences), + end(this->mManagedSequences), + [](std::weak_ptr t) { return t.expired(); }), + end(this->mManagedSequences)); + } +} + +void +Manager::createDevice(const std::vector& familyQueueIndices, + uint32_t physicalDeviceIndex, + const std::vector& desiredExtensions) +{ + + KP_LOG_DEBUG("Kompute Manager creating Device"); + + if (this->mInstance == nullptr) { + throw std::runtime_error("Kompute Manager instance is null"); + } + + this->mFreeDevice = true; + + // Getting an integer that says how many vuklan devices we have + std::vector physicalDevices = + this->mInstance->enumeratePhysicalDevices(); + uint32_t deviceCount = physicalDevices.size(); + + // This means there are no devices at all + if (deviceCount == 0) { + throw std::runtime_error("Failed to find GPUs with Vulkan support! " + "Maybe you haven't installed vulkan drivers?"); + } + + // This means that we're exceeding our device limit, for + // example if we have 2 devices, just physicalDeviceIndex + // 0 and 1 are acceptable. 
Hence, physicalDeviceIndex should + // always be less than deviceCount, else we raise an error + if (!(deviceCount > physicalDeviceIndex)) { + throw std::runtime_error("There is no such physical index or device, " + "please use your existing device"); + } + + vk::PhysicalDevice physicalDevice = physicalDevices[physicalDeviceIndex]; + + this->mPhysicalDevice = + std::make_shared(physicalDevice); + +#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO + vk::PhysicalDeviceProperties physicalDeviceProperties = + physicalDevice.getProperties(); +#endif + + KP_LOG_INFO("Using physical device index {} found {}", + physicalDeviceIndex, + physicalDeviceProperties.deviceName); + + if (familyQueueIndices.empty()) { + // Find compute queue + std::vector allQueueFamilyProperties = + physicalDevice.getQueueFamilyProperties(); + + uint32_t computeQueueFamilyIndex = 0; + bool computeQueueSupported = false; + for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) { + vk::QueueFamilyProperties queueFamilyProperties = + allQueueFamilyProperties[i]; + + if (queueFamilyProperties.queueFlags & + vk::QueueFlagBits::eCompute) { + computeQueueFamilyIndex = i; + computeQueueSupported = true; + break; + } + } + + if (!computeQueueSupported) { + throw std::runtime_error("Compute queue is not supported"); + } + + this->mComputeQueueFamilyIndices.push_back(computeQueueFamilyIndex); + } else { + this->mComputeQueueFamilyIndices = familyQueueIndices; + } + + std::unordered_map familyQueueCounts; + std::unordered_map> familyQueuePriorities; + for (const auto& value : this->mComputeQueueFamilyIndices) { + familyQueueCounts[value]++; + familyQueuePriorities[value].push_back(1.0f); + } + + std::unordered_map familyQueueIndexCount; + std::vector deviceQueueCreateInfos; + for (const auto& familyQueueInfo : familyQueueCounts) { + // Setting the device count to 0 + familyQueueIndexCount[familyQueueInfo.first] = 0; + + // Creating the respective device queue + vk::DeviceQueueCreateInfo deviceQueueCreateInfo( + vk::DeviceQueueCreateFlags(), + familyQueueInfo.first, + familyQueueInfo.second, + familyQueuePriorities[familyQueueInfo.first].data()); + deviceQueueCreateInfos.push_back(deviceQueueCreateInfo); + } + + KP_LOG_DEBUG("Kompute Manager desired extension layers {}", + fmt::join(desiredExtensions, ", ")); + + std::vector deviceExtensions = + this->mPhysicalDevice->enumerateDeviceExtensionProperties(); + + std::set uniqueExtensionNames; + for (const vk::ExtensionProperties& ext : deviceExtensions) { + uniqueExtensionNames.insert(ext.extensionName); + } + KP_LOG_DEBUG("Kompute Manager available extensions {}", + fmt::join(uniqueExtensionNames, ", ")); + std::vector validExtensions; + for (const std::string& ext : desiredExtensions) { + if (uniqueExtensionNames.count(ext) != 0) { + validExtensions.push_back(ext.c_str()); + } + } + if (desiredExtensions.size() != validExtensions.size()) { + KP_LOG_ERROR("Kompute Manager not all extensions were added: {}", + fmt::join(validExtensions, ", ")); + } + + vk::PhysicalDeviceFeatures features; + features.shaderInt16 = true; + + vk::PhysicalDeviceVulkan11Features features11; + features11.uniformAndStorageBuffer16BitAccess = true; + features11.storageBuffer16BitAccess = true; + features11.pNext = nullptr; + + vk::PhysicalDeviceVulkan12Features features12; + features12.storageBuffer8BitAccess = true; + features12.uniformAndStorageBuffer8BitAccess = true; + features12.shaderFloat16 = true; + features12.shaderInt8 = true; + features12.pNext = &features11; + + vk::DeviceCreateInfo 
deviceCreateInfo(vk::DeviceCreateFlags(), + deviceQueueCreateInfos.size(), + deviceQueueCreateInfos.data(), + {}, + {}, + validExtensions.size(), + validExtensions.data(), + &features); + + deviceCreateInfo.pNext = &features12; + + this->mDevice = std::make_shared(); + vk::Result r = physicalDevice.createDevice( + &deviceCreateInfo, nullptr, this->mDevice.get()); + if (r != vk::Result::eSuccess) { + KP_LOG_ERROR("Kompute Manager could not create device"); + } + + KP_LOG_DEBUG("Kompute Manager device created"); + + for (const uint32_t& familyQueueIndex : this->mComputeQueueFamilyIndices) { + std::shared_ptr currQueue = std::make_shared(); + + this->mDevice->getQueue(familyQueueIndex, + familyQueueIndexCount[familyQueueIndex], + currQueue.get()); + + familyQueueIndexCount[familyQueueIndex]++; + + this->mComputeQueues.push_back(currQueue); + } + + KP_LOG_DEBUG("Kompute Manager compute queue obtained"); +} + +std::shared_ptr +Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps) +{ + KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex); + + std::shared_ptr sq{ new kp::Sequence( + this->mPhysicalDevice, + this->mDevice, + this->mComputeQueues[queueIndex], + this->mComputeQueueFamilyIndices[queueIndex], + totalTimestamps) }; + + if (this->mManageResources) { + this->mManagedSequences.push_back(sq); + } + + return sq; +} + +vk::PhysicalDeviceProperties +Manager::getDeviceProperties() const +{ + return this->mPhysicalDevice->getProperties(); +} + +std::vector +Manager::listDevices() const +{ + return this->mInstance->enumeratePhysicalDevices(); +} + +std::shared_ptr +Manager::getVkInstance() const +{ + return this->mInstance; +} + +} diff --git a/kompute/src/OpAlgoDispatch.cpp b/kompute/src/OpAlgoDispatch.cpp new file mode 100644 index 000000000..cad334f0c --- /dev/null +++ b/kompute/src/OpAlgoDispatch.cpp @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
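+ *
+ * OpAlgoDispatch::record() places a transfer-to-compute-shader memory barrier on
+ * every tensor bound to the algorithm, forwards any push constants, then binds the
+ * pipeline and descriptor set and records the dispatch.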
+ */ + +#include "kompute/operations/OpAlgoDispatch.hpp" + +namespace kp { + +OpAlgoDispatch::~OpAlgoDispatch() +{ + KP_LOG_DEBUG("Kompute OpAlgoDispatch destructor started"); + + if (this->mPushConstantsData) { + KP_LOG_DEBUG("Kompute freeing push constants data"); + free(this->mPushConstantsData); + } +} + +void +OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpAlgoDispatch record called"); + + // Barrier to ensure the data is finished writing to buffer memory + for (const std::shared_ptr& tensor : + this->mAlgorithm->getTensors()) { + tensor->recordPrimaryBufferMemoryBarrier( + commandBuffer, + vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eShaderRead, + vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eComputeShader); + } + + if (this->mPushConstantsSize) { + this->mAlgorithm->setPushConstants( + this->mPushConstantsData, + this->mPushConstantsSize, + this->mPushConstantsDataTypeMemorySize); + } + + this->mAlgorithm->recordBindCore(commandBuffer); + this->mAlgorithm->recordBindPush(commandBuffer); + this->mAlgorithm->recordDispatch(commandBuffer); +} + +void +OpAlgoDispatch::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpAlgoDispatch preEval called"); +} + +void +OpAlgoDispatch::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpAlgoDispatch postSubmit called"); +} + +} diff --git a/kompute/src/OpBufferSyncDevice.cpp b/kompute/src/OpBufferSyncDevice.cpp new file mode 100644 index 000000000..baaafda0f --- /dev/null +++ b/kompute/src/OpBufferSyncDevice.cpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#include "kompute/operations/OpBufferSyncDevice.hpp" + +namespace kp { + +OpBufferSyncDevice::OpBufferSyncDevice( + vk::Buffer *primaryBuffer, + vk::Buffer *stagingBuffer, + vk::DeviceSize size) + : mPrimaryBuffer(primaryBuffer) + , mStagingBuffer(stagingBuffer) + , mSize(size) +{ + KP_LOG_DEBUG("Kompute OpBufferSyncDevice constructor with params"); +} + +OpBufferSyncDevice::~OpBufferSyncDevice() +{ + KP_LOG_DEBUG("Kompute OpBufferSyncDevice destructor started"); +} + +void +OpBufferSyncDevice::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpBufferSyncDevice record called"); + vk::BufferCopy copyRegion(0, 0, mSize); + commandBuffer.copyBuffer(*mStagingBuffer, *mPrimaryBuffer, copyRegion); +} + +void +OpBufferSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpBufferSyncDevice preEval called"); +} + +void +OpBufferSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpBufferSyncDevice postEval called"); +} + +} diff --git a/kompute/src/OpBufferSyncLocal.cpp b/kompute/src/OpBufferSyncLocal.cpp new file mode 100644 index 000000000..63739a351 --- /dev/null +++ b/kompute/src/OpBufferSyncLocal.cpp @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. 
A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#include "kompute/operations/OpBufferSyncLocal.hpp" + +namespace kp { + +OpBufferSyncLocal::OpBufferSyncLocal( + vk::Buffer *primaryBuffer, + vk::Buffer *stagingBuffer, + vk::DeviceSize size) + : mPrimaryBuffer(primaryBuffer) + , mStagingBuffer(stagingBuffer) + , mSize(size) +{ + KP_LOG_DEBUG("Kompute OpBufferSyncLocal constructor with params"); +} + +OpBufferSyncLocal::~OpBufferSyncLocal() +{ + KP_LOG_DEBUG("Kompute OpBufferSyncLocal destructor started"); +} + +void +OpBufferSyncLocal::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpBufferSyncLocal record called"); + vk::BufferCopy copyRegion(0, 0, mSize); + commandBuffer.copyBuffer(*mPrimaryBuffer, *mStagingBuffer, copyRegion); +} + +void +OpBufferSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpBufferSyncLocal preEval called"); +} + +void +OpBufferSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpBufferSyncLocal postEval called"); +} + +} diff --git a/kompute/src/OpMemoryBarrier.cpp b/kompute/src/OpMemoryBarrier.cpp new file mode 100644 index 000000000..89d44d85e --- /dev/null +++ b/kompute/src/OpMemoryBarrier.cpp @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#include "kompute/operations/OpMemoryBarrier.hpp" + +namespace kp { + +OpMemoryBarrier::OpMemoryBarrier( + const std::vector>& tensors, + const vk::AccessFlagBits& srcAccessMask, + const vk::AccessFlagBits& dstAccessMask, + const vk::PipelineStageFlagBits& srcStageMask, + const vk::PipelineStageFlagBits& dstStageMask, + bool barrierOnPrimary) + : mSrcAccessMask(srcAccessMask) + , mDstAccessMask(dstAccessMask) + , mSrcStageMask(srcStageMask) + , mDstStageMask(dstStageMask) + , mBarrierOnPrimary(barrierOnPrimary) + , mTensors(tensors) +{ + KP_LOG_DEBUG("Kompute OpMemoryBarrier constructor"); +} + +OpMemoryBarrier::~OpMemoryBarrier() +{ + KP_LOG_DEBUG("Kompute OpMemoryBarrier destructor started"); +} + +void +OpMemoryBarrier::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpMemoryBarrier record called"); + + // Barrier to ensure the data is finished writing to buffer memory + if (this->mBarrierOnPrimary) { + for (const std::shared_ptr& tensor : this->mTensors) { + tensor->recordPrimaryBufferMemoryBarrier(commandBuffer, + this->mSrcAccessMask, + this->mDstAccessMask, + this->mSrcStageMask, + this->mDstStageMask); + } + } else { + for (const std::shared_ptr& tensor : this->mTensors) { + tensor->recordStagingBufferMemoryBarrier(commandBuffer, + this->mSrcAccessMask, + this->mDstAccessMask, + this->mSrcStageMask, + this->mDstStageMask); + } + } +} + +void +OpMemoryBarrier::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpMemoryBarrier preEval called"); +} + +void +OpMemoryBarrier::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpMemoryBarrier postSubmit called"); +} + +} diff --git a/kompute/src/OpTensorCopy.cpp b/kompute/src/OpTensorCopy.cpp new file mode 
100644 index 000000000..e732cc413 --- /dev/null +++ b/kompute/src/OpTensorCopy.cpp @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#include "kompute/operations/OpTensorCopy.hpp" +#include "kompute/Tensor.hpp" + +namespace kp { + +OpTensorCopy::OpTensorCopy(const std::vector>& tensors) +{ + KP_LOG_DEBUG("Kompute OpTensorCopy constructor with params"); + + this->mTensors = tensors; + + if (this->mTensors.size() < 2) { + throw std::runtime_error( + "Kompute OpTensorCopy called with less than 2 tensor"); + } + + kp::Tensor::TensorDataTypes dataType = this->mTensors[0]->dataType(); + uint32_t size = this->mTensors[0]->size(); + for (const std::shared_ptr& tensor : tensors) { + if (tensor->dataType() != dataType) { + throw std::runtime_error(fmt::format( + "Attempting to copy tensors of different types from {} to {}", + Tensor::toString(dataType), + Tensor::toString(tensor->dataType()))); + } + if (tensor->size() != size) { + throw std::runtime_error(fmt::format( + "Attempting to copy tensors of different sizes from {} to {}", + size, + tensor->size())); + } + } +} + +OpTensorCopy::~OpTensorCopy() +{ + KP_LOG_DEBUG("Kompute OpTensorCopy destructor started"); +} + +void +OpTensorCopy::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpTensorCopy record called"); + + // We iterate from the second tensor onwards and record a copy to all + for (size_t i = 1; i < this->mTensors.size(); i++) { + this->mTensors[i]->recordCopyFrom(commandBuffer, this->mTensors[0]); + } +} + +void +OpTensorCopy::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorCopy preEval called"); +} + +void +OpTensorCopy::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorCopy postEval called"); + + // Do not copy on CPU side if source is storage tensor + if (this->mTensors[0]->tensorType() == kp::Tensor::TensorTypes::eStorage) + { + KP_LOG_DEBUG("Kompute OpTensorCopy not copying tensor source given it's of eStorage type"); + return; + } + void* data = this->mTensors[0]->rawData(); + + // Copy the data from the first tensor into all the tensors + for (size_t i = 1; i < this->mTensors.size(); i++) { + if (this->mTensors[i]->tensorType() == kp::Tensor::TensorTypes::eStorage) { + KP_LOG_DEBUG("Kompute OpTensorCopy not copying to tensor dest given it's of eStorage type"); + continue; + } + this->mTensors[i]->setRawData(data); + } +} + +} diff --git a/kompute/src/OpTensorSyncDevice.cpp b/kompute/src/OpTensorSyncDevice.cpp new file mode 100644 index 000000000..4cc6abf71 --- /dev/null +++ b/kompute/src/OpTensorSyncDevice.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
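+ *
+ * OpTensorSyncDevice::record() copies each eDevice tensor's host-visible staging
+ * buffer into its device-local primary buffer, making host-side writes visible to
+ * subsequent compute dispatches.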
+ */ + +#include "kompute/operations/OpTensorSyncDevice.hpp" + +namespace kp { + +OpTensorSyncDevice::OpTensorSyncDevice( + const std::vector>& tensors) + : mPrimaryBuffer(nullptr) + , mStagingBuffer(nullptr) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncDevice constructor with params"); + + if (tensors.size() < 1) { + throw std::runtime_error( + "Kompute OpTensorSyncDevice called with less than 1 tensor"); + } + + this->mTensors = tensors; +} + +OpTensorSyncDevice::~OpTensorSyncDevice() +{ + KP_LOG_DEBUG("Kompute OpTensorSyncDevice destructor started"); + + this->mTensors.clear(); +} + +void +OpTensorSyncDevice::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncDevice record called"); + + for (size_t i = 0; i < this->mTensors.size(); i++) { + if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { + this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer); + } + } +} + +void +OpTensorSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncDevice preEval called"); +} + +void +OpTensorSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncDevice postEval called"); +} + +} diff --git a/kompute/src/OpTensorSyncLocal.cpp b/kompute/src/OpTensorSyncLocal.cpp new file mode 100644 index 000000000..1aa091b73 --- /dev/null +++ b/kompute/src/OpTensorSyncLocal.cpp @@ -0,0 +1,76 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
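+ *
+ * OpTensorSyncLocal::record() places a compute-to-transfer barrier, copies each
+ * eDevice tensor from its primary buffer back to its staging buffer, then places a
+ * transfer-to-host barrier so shader results can be read on the host.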
+ */ + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpTensorSyncLocal.hpp" + +namespace kp { + +OpTensorSyncLocal::OpTensorSyncLocal( + const std::vector>& tensors) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncLocal constructor with params"); + + if (tensors.size() < 1) { + throw std::runtime_error( + "Kompute OpTensorSyncLocal called with less than 1 tensor"); + } + + this->mTensors = tensors; +} + +OpTensorSyncLocal::~OpTensorSyncLocal() +{ + KP_LOG_DEBUG("Kompute OpTensorSyncLocal destructor started"); +} + +void +OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncLocal record called"); + + for (size_t i = 0; i < this->mTensors.size(); i++) { + if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { + + this->mTensors[i]->recordPrimaryBufferMemoryBarrier( + commandBuffer, + vk::AccessFlagBits::eShaderWrite, + vk::AccessFlagBits::eTransferRead, + vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eTransfer); + + this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer); + + this->mTensors[i]->recordPrimaryBufferMemoryBarrier( + commandBuffer, + vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eHostRead, + vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eHost); + } + } +} + +void +OpTensorSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncLocal preEval called"); +} + +void +OpTensorSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorSyncLocal postEval called"); + + KP_LOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local"); +} + +} diff --git a/kompute/src/Sequence.cpp b/kompute/src/Sequence.cpp new file mode 100644 index 000000000..3b5fb5fb5 --- /dev/null +++ b/kompute/src/Sequence.cpp @@ -0,0 +1,396 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
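+ *
+ * Sequence wraps a command pool and a primary command buffer for one compute queue:
+ * operations are recorded into the buffer and submitted with eval()/evalAsync(),
+ * guarded by a fence that evalAwait() waits on; timestamps can optionally be
+ * collected through a query pool created at construction time.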
+ */ + +#include "kompute/Sequence.hpp" + +namespace kp { + +Sequence::Sequence(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr computeQueue, + uint32_t queueIndex, + uint32_t totalTimestamps) +{ + KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue"); + + this->mPhysicalDevice = physicalDevice; + this->mDevice = device; + this->mComputeQueue = computeQueue; + this->mQueueIndex = queueIndex; + + this->createCommandPool(); + this->createCommandBuffer(); + if (totalTimestamps > 0) + this->createTimestampQueryPool(totalTimestamps + + 1); //+1 for the first one +} + +Sequence::~Sequence() +{ + KP_LOG_DEBUG("Kompute Sequence Destructor started"); + + if (this->mDevice) { + this->destroy(); + } +} + +void +Sequence::begin() +{ + KP_LOG_DEBUG("Kompute sequence called BEGIN"); + + if (this->isRecording()) { + KP_LOG_DEBUG("Kompute Sequence begin called when already recording"); + return; + } + + if (this->isRunning()) { + throw std::runtime_error( + "Kompute Sequence begin called when sequence still running"); + } + + KP_LOG_INFO("Kompute Sequence command now started recording"); + this->mCommandBuffer->begin(vk::CommandBufferBeginInfo()); + this->mRecording = true; + + // latch the first timestamp before any commands are submitted + if (this->timestampQueryPool) + this->mCommandBuffer->writeTimestamp( + vk::PipelineStageFlagBits::eAllCommands, + *this->timestampQueryPool, + 0); +} + +void +Sequence::end() +{ + KP_LOG_DEBUG("Kompute Sequence calling END"); + + if (this->isRunning()) { + throw std::runtime_error( + "Kompute Sequence begin called when sequence still running"); + } + + if (!this->isRecording()) { + KP_LOG_WARN("Kompute Sequence end called when not recording"); + return; + } else { + KP_LOG_INFO("Kompute Sequence command recording END"); + this->mCommandBuffer->end(); + this->mRecording = false; + } +} + +void +Sequence::clear() +{ + KP_LOG_DEBUG("Kompute Sequence calling clear"); + if (this->isRecording()) { + this->end(); + } +} + +std::shared_ptr +Sequence::eval() +{ + KP_LOG_DEBUG("Kompute sequence EVAL BEGIN"); + + return this->evalAsync()->evalAwait(); +} + +std::shared_ptr +Sequence::eval(std::shared_ptr op) +{ + this->clear(); + return this->record(op)->eval(); +} + +std::shared_ptr +Sequence::evalAsync() +{ + if (this->isRecording()) { + this->end(); + } + + if (this->mIsRunning) { + throw std::runtime_error( + "Kompute Sequence evalAsync called when an eval async was " + "called without successful wait"); + } + + this->mIsRunning = true; + + for (size_t i = 0; i < this->mOperations.size(); i++) { + this->mOperations[i]->preEval(*this->mCommandBuffer); + } + + vk::SubmitInfo submitInfo( + 0, nullptr, nullptr, 1, this->mCommandBuffer.get()); + + this->mFence = this->mDevice->createFence(vk::FenceCreateInfo()); + + KP_LOG_DEBUG( + "Kompute sequence submitting command buffer into compute queue"); + + this->mComputeQueue->submit(1, &submitInfo, this->mFence); + + return shared_from_this(); +} + +std::shared_ptr +Sequence::evalAsync(std::shared_ptr op) +{ + this->clear(); + this->record(op); + this->evalAsync(); + return shared_from_this(); +} + +std::shared_ptr +Sequence::evalAwait(uint64_t waitFor) +{ + if (!this->mIsRunning) { + KP_LOG_WARN("Kompute Sequence evalAwait called without existing eval"); + return shared_from_this(); + } + + vk::Result result = + this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor); + this->mDevice->destroy( + this->mFence, (vk::Optional)nullptr); + + this->mIsRunning = false; + + if (result 
== vk::Result::eTimeout) { + KP_LOG_WARN("Kompute Sequence evalAwait reached timeout of {}", + waitFor); + return shared_from_this(); + } + + for (size_t i = 0; i < this->mOperations.size(); i++) { + this->mOperations[i]->postEval(*this->mCommandBuffer); + } + + return shared_from_this(); +} + +bool +Sequence::isRunning() const +{ + return this->mIsRunning; +} + +bool +Sequence::isRecording() const +{ + return this->mRecording; +} + +bool +Sequence::isInit() const +{ + return this->mDevice && this->mCommandPool && this->mCommandBuffer && + this->mComputeQueue; +} + +void +Sequence::rerecord() +{ + this->end(); + std::vector> ops = this->mOperations; + this->mOperations.clear(); + for (const std::shared_ptr& op : ops) { + this->record(op); + } +} + +void +Sequence::destroy() +{ + KP_LOG_DEBUG("Kompute Sequence destroy called"); + + if (!this->mDevice) { + KP_LOG_WARN("Kompute Sequence destroy called " + "with null Device pointer"); + return; + } + + if (this->mFreeCommandBuffer) { + KP_LOG_INFO("Freeing CommandBuffer"); + if (!this->mCommandBuffer) { + KP_LOG_WARN("Kompute Sequence destroy called with null " + "CommandPool pointer"); + return; + } + this->mDevice->freeCommandBuffers( + *this->mCommandPool, 1, this->mCommandBuffer.get()); + + this->mCommandBuffer = nullptr; + this->mFreeCommandBuffer = false; + + KP_LOG_DEBUG("Kompute Sequence Freed CommandBuffer"); + } + + if (this->mFreeCommandPool) { + KP_LOG_INFO("Destroying CommandPool"); + if (this->mCommandPool == nullptr) { + KP_LOG_WARN("Kompute Sequence destroy called with null " + "CommandPool pointer"); + return; + } + this->mDevice->destroy( + *this->mCommandPool, + (vk::Optional)nullptr); + + this->mCommandPool = nullptr; + this->mFreeCommandPool = false; + + KP_LOG_DEBUG("Kompute Sequence Destroyed CommandPool"); + } + + if (this->mOperations.size()) { + KP_LOG_INFO("Kompute Sequence clearing operations buffer"); + this->mOperations.clear(); + } + + if (this->timestampQueryPool) { + KP_LOG_INFO("Destroying QueryPool"); + this->mDevice->destroy( + *this->timestampQueryPool, + (vk::Optional)nullptr); + + this->timestampQueryPool = nullptr; + KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool"); + } + + if (this->mDevice) { + this->mDevice = nullptr; + } + if (this->mPhysicalDevice) { + this->mPhysicalDevice = nullptr; + } + if (this->mComputeQueue) { + this->mComputeQueue = nullptr; + } +} + +std::shared_ptr +Sequence::record(std::shared_ptr op) +{ + KP_LOG_DEBUG("Kompute Sequence record function started"); + + this->begin(); + + KP_LOG_DEBUG( + "Kompute Sequence running record on OpBase derived class instance"); + + op->record(*this->mCommandBuffer); + + this->mOperations.push_back(op); + + if (this->timestampQueryPool) + this->mCommandBuffer->writeTimestamp( + vk::PipelineStageFlagBits::eAllCommands, + *this->timestampQueryPool, + this->mOperations.size()); + + return shared_from_this(); +} + +void +Sequence::createCommandPool() +{ + KP_LOG_DEBUG("Kompute Sequence creating command pool"); + + if (!this->mDevice) { + throw std::runtime_error("Kompute Sequence device is null"); + } + + this->mFreeCommandPool = true; + + vk::CommandPoolCreateInfo commandPoolInfo(vk::CommandPoolCreateFlags(), + this->mQueueIndex); + this->mCommandPool = std::make_shared(); + this->mDevice->createCommandPool( + &commandPoolInfo, nullptr, this->mCommandPool.get()); + KP_LOG_DEBUG("Kompute Sequence Command Pool Created"); +} + +void +Sequence::createCommandBuffer() +{ + KP_LOG_DEBUG("Kompute Sequence creating command buffer"); + if (!this->mDevice) { 
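+ // A command buffer can only be allocated from a live device and an existing
+ // command pool, so fail fast if either is missing.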
+ throw std::runtime_error("Kompute Sequence device is null"); + } + if (!this->mCommandPool) { + throw std::runtime_error("Kompute Sequence command pool is null"); + } + + this->mFreeCommandBuffer = true; + + vk::CommandBufferAllocateInfo commandBufferAllocateInfo( + *this->mCommandPool, vk::CommandBufferLevel::ePrimary, 1); + + this->mCommandBuffer = std::make_shared(); + this->mDevice->allocateCommandBuffers(&commandBufferAllocateInfo, + this->mCommandBuffer.get()); + KP_LOG_DEBUG("Kompute Sequence Command Buffer Created"); +} + +void +Sequence::createTimestampQueryPool(uint32_t totalTimestamps) +{ + KP_LOG_DEBUG("Kompute Sequence creating query pool"); + if (!this->isInit()) { + throw std::runtime_error( + "createTimestampQueryPool() called on uninitialized Sequence"); + } + if (!this->mPhysicalDevice) { + throw std::runtime_error("Kompute Sequence physical device is null"); + } + + vk::PhysicalDeviceProperties physicalDeviceProperties = + this->mPhysicalDevice->getProperties(); + + if (physicalDeviceProperties.limits.timestampComputeAndGraphics) { + vk::QueryPoolCreateInfo queryPoolInfo; + queryPoolInfo.setQueryCount(totalTimestamps); + queryPoolInfo.setQueryType(vk::QueryType::eTimestamp); + this->timestampQueryPool = std::make_shared( + this->mDevice->createQueryPool(queryPoolInfo)); + + KP_LOG_DEBUG("Query pool for timestamps created"); + } else { + throw std::runtime_error("Device does not support timestamps"); + } +} + +std::vector +Sequence::getTimestamps() +{ + if (!this->timestampQueryPool) + throw std::runtime_error("Timestamp latching not enabled"); + + const auto n = this->mOperations.size() + 1; + std::vector timestamps(n, 0); + this->mDevice->getQueryPoolResults( + *this->timestampQueryPool, + 0, + n, + timestamps.size() * sizeof(std::uint64_t), + timestamps.data(), + sizeof(uint64_t), + vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait); + + return timestamps; +} + +} diff --git a/kompute/src/Tensor.cpp b/kompute/src/Tensor.cpp new file mode 100644 index 000000000..9c343ff13 --- /dev/null +++ b/kompute/src/Tensor.cpp @@ -0,0 +1,451 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
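+ *
+ * In this backend the Tensor does not allocate memory itself: it wraps externally
+ * provided primary/staging vk::Buffer and vk::DeviceMemory handles plus an offset,
+ * and its record* helpers emit the buffer copies and memory barriers used by the
+ * sync and dispatch operations.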
+ */ + +#include "kompute/Tensor.hpp" + +namespace kp { + +std::string +Tensor::toString(Tensor::TensorDataTypes dt) +{ + switch (dt) { + case TensorDataTypes::eBool: + return "eBool"; + case TensorDataTypes::eInt: + return "eInt"; + case TensorDataTypes::eUnsignedInt: + return "eUnsignedInt"; + case TensorDataTypes::eFloat: + return "eFloat"; + case TensorDataTypes::eDouble: + return "eDouble"; + default: + return "unknown"; + } +} + +std::string +Tensor::toString(Tensor::TensorTypes dt) +{ + switch (dt) { + case TensorTypes::eDevice: + return "eDevice"; + case TensorTypes::eHost: + return "eHost"; + case TensorTypes::eStorage: + return "eStorage"; + default: + return "unknown"; + } +} + +Tensor::Tensor(std::shared_ptr physicalDevice, + std::shared_ptr device, + void* data, + uint32_t elementTotalCount, + uint32_t elementMemorySize, + const TensorDataTypes& dataType, + vk::DeviceMemory *primaryMemory, + vk::Buffer *primaryBuffer, + vk::DeviceMemory *stagingMemory, + vk::Buffer *stagingBuffer, + vk::DeviceSize offset, + const TensorTypes& tensorType) +{ + KP_LOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}", + elementTotalCount, + Tensor::toString(tensorType)); + + this->mPhysicalDevice = physicalDevice; + this->mDevice = device; + this->mDataType = dataType; + this->mTensorType = tensorType; + + this->rebuild(data, elementTotalCount, elementMemorySize, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset); +} + +Tensor::~Tensor() +{ + KP_LOG_DEBUG("Kompute Tensor destructor started. Type: {}", + Tensor::toString(this->tensorType())); + + if (this->mDevice) { + this->destroy(); + } + + KP_LOG_DEBUG("Kompute Tensor destructor success"); +} + +void +Tensor::rebuild(void* /*data*/, + uint32_t elementTotalCount, + uint64_t memorySize, + vk::DeviceMemory *primaryMemory, + vk::Buffer *primaryBuffer, + vk::DeviceMemory *stagingMemory, + vk::Buffer *stagingBuffer, + vk::DeviceSize offset) +{ + KP_LOG_DEBUG("Kompute Tensor rebuilding with size {}", elementTotalCount); + + this->mSize = elementTotalCount; + this->mMemorySize = memorySize; + this->mOffset = offset; + + if (this->mPrimaryBuffer || this->mPrimaryMemory) { + KP_LOG_DEBUG( + "Kompute Tensor destroying existing resources before rebuild"); + this->destroy(); + } + + this->setGPUResources(primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset); +} + +Tensor::TensorTypes +Tensor::tensorType() +{ + return this->mTensorType; +} + +bool +Tensor::isInit() +{ + return this->mDevice && this->mPrimaryBuffer && this->mPrimaryMemory && + this->mRawData; +} + +uint32_t +Tensor::size() +{ + return this->mSize; +} + +uint64_t +Tensor::memorySize() +{ + return this->mMemorySize; +} + +kp::Tensor::TensorDataTypes +Tensor::dataType() +{ + return this->mDataType; +} + +void* +Tensor::rawData() +{ + return this->mRawData; +} + +void +Tensor::setRawData(const void* data) +{ + memcpy(this->mRawData, data, this->memorySize()); +} + +void +Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer, + std::shared_ptr copyFromTensor) +{ + + vk::DeviceSize bufferSize(this->memorySize()); + vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize); + + KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize); + + this->recordCopyBuffer(commandBuffer, + copyFromTensor->mPrimaryBuffer, + this->mPrimaryBuffer, + bufferSize, + copyRegion); +} + +void +Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer) +{ + if (!this->mStagingBuffer) + return; + + vk::DeviceSize 
bufferSize(this->memorySize()); + vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize); + + KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); + + this->recordCopyBuffer(commandBuffer, + this->mStagingBuffer, + this->mPrimaryBuffer, + bufferSize, + copyRegion); +} + +void +Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer) +{ + if (!this->mStagingBuffer) + return; + + vk::DeviceSize bufferSize(this->memorySize()); + vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize); + + KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); + + this->recordCopyBuffer(commandBuffer, + this->mPrimaryBuffer, + this->mStagingBuffer, + bufferSize, + copyRegion); +} + +void +Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer, + vk::Buffer *bufferFrom, + vk::Buffer *bufferTo, + vk::DeviceSize /*bufferSize*/, + vk::BufferCopy copyRegion) +{ + + commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion); +} + +void +Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask) +{ + KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier"); + + this->recordBufferMemoryBarrier(commandBuffer, + *this->mPrimaryBuffer, + srcAccessMask, + dstAccessMask, + srcStageMask, + dstStageMask); +} + +void +Tensor::recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask) +{ + if (!this->mStagingBuffer) + return; + + KP_LOG_DEBUG("Kompute Tensor recording STAGING buffer memory barrier"); + + this->recordBufferMemoryBarrier(commandBuffer, + *this->mStagingBuffer, + srcAccessMask, + dstAccessMask, + srcStageMask, + dstStageMask); +} + +void +Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + const vk::Buffer& buffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask) +{ + KP_LOG_DEBUG("Kompute Tensor recording buffer memory barrier"); + + vk::DeviceSize bufferSize = this->memorySize(); + + vk::BufferMemoryBarrier bufferMemoryBarrier; + bufferMemoryBarrier.buffer = buffer; + bufferMemoryBarrier.size = bufferSize; + bufferMemoryBarrier.srcAccessMask = srcAccessMask; + bufferMemoryBarrier.dstAccessMask = dstAccessMask; + bufferMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + bufferMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + + commandBuffer.pipelineBarrier(srcStageMask, + dstStageMask, + vk::DependencyFlags(), + nullptr, + bufferMemoryBarrier, + nullptr); +} + +vk::DescriptorBufferInfo +Tensor::constructDescriptorBufferInfo() +{ + KP_LOG_DEBUG("Kompute Tensor construct descriptor buffer info size {}", + this->memorySize()); + vk::DeviceSize bufferSize = this->memorySize(); + return vk::DescriptorBufferInfo(*this->mPrimaryBuffer, + mOffset, // offset + bufferSize); +} + +vk::BufferUsageFlags +Tensor::getPrimaryBufferUsageFlags() +{ + switch (this->mTensorType) { + case TensorTypes::eDevice: + return vk::BufferUsageFlagBits::eStorageBuffer | + vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eTransferDst; + break; + case TensorTypes::eHost: + return vk::BufferUsageFlagBits::eStorageBuffer | + vk::BufferUsageFlagBits::eTransferSrc | + 
vk::BufferUsageFlagBits::eTransferDst; + break; + case TensorTypes::eStorage: + return vk::BufferUsageFlagBits::eStorageBuffer; + break; + default: + throw std::runtime_error("Kompute Tensor invalid tensor type"); + } +} + +vk::MemoryPropertyFlags +Tensor::getPrimaryMemoryPropertyFlags() +{ + switch (this->mTensorType) { + case TensorTypes::eDevice: + return vk::MemoryPropertyFlagBits::eDeviceLocal; + break; + case TensorTypes::eHost: + return vk::MemoryPropertyFlagBits::eHostVisible | + vk::MemoryPropertyFlagBits::eHostCoherent; + break; + case TensorTypes::eStorage: + return vk::MemoryPropertyFlagBits::eDeviceLocal; + break; + default: + throw std::runtime_error("Kompute Tensor invalid tensor type"); + } +} + +vk::BufferUsageFlags +Tensor::getStagingBufferUsageFlags() +{ + switch (this->mTensorType) { + case TensorTypes::eDevice: + return vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eTransferDst; + break; + default: + throw std::runtime_error("Kompute Tensor invalid tensor type"); + } +} + +vk::MemoryPropertyFlags +Tensor::getStagingMemoryPropertyFlags() +{ + switch (this->mTensorType) { + case TensorTypes::eDevice: + return vk::MemoryPropertyFlagBits::eHostVisible | + vk::MemoryPropertyFlagBits::eHostCoherent; + break; + default: + throw std::runtime_error("Kompute Tensor invalid tensor type"); + } +} + +void +Tensor::setGPUResources(vk::DeviceMemory *primaryMemory, + vk::Buffer *primaryBuffer, + vk::DeviceMemory *stagingMemory, + vk::Buffer *stagingBuffer, + vk::DeviceSize /*offset*/) +{ + KP_LOG_DEBUG("Kompute Tensor creating buffer"); + + if (!this->mPhysicalDevice) { + throw std::runtime_error("Kompute Tensor phyisical device is null"); + } + if (!this->mDevice) { + throw std::runtime_error("Kompute Tensor device is null"); + } + + KP_LOG_DEBUG("Kompute Tensor creating primary buffer and memory"); + + this->mPrimaryBuffer = primaryBuffer; + this->mPrimaryMemory = primaryMemory; + + if (this->mTensorType == TensorTypes::eDevice) { + KP_LOG_DEBUG("Kompute Tensor creating staging buffer and memory"); + + this->mStagingBuffer = stagingBuffer; + this->mStagingMemory = stagingMemory; + } + + KP_LOG_DEBUG("Kompute Tensor buffer & memory creation successful"); +} + +void +Tensor::destroy() +{ + KP_LOG_DEBUG("Kompute Tensor started destroy()"); + + // Setting raw data to null regardless whether device is available to + // invalidate Tensor + this->mRawData = nullptr; + this->mSize = 0; + this->mMemorySize = 0; + + if (!this->mDevice) { + KP_LOG_WARN( + "Kompute Tensor destructor reached with null Device pointer"); + return; + } + + if (this->mDevice) { + this->mDevice = nullptr; + } + + KP_LOG_DEBUG("Kompute Tensor successful destroy()"); +} + +template<> +Tensor::TensorDataTypes +TensorT::dataType() +{ + return Tensor::TensorDataTypes::eBool; +} + +template<> +Tensor::TensorDataTypes +TensorT::dataType() +{ + return Tensor::TensorDataTypes::eInt; +} + +template<> +Tensor::TensorDataTypes +TensorT::dataType() +{ + return Tensor::TensorDataTypes::eUnsignedInt; +} + +template<> +Tensor::TensorDataTypes +TensorT::dataType() +{ + return Tensor::TensorDataTypes::eFloat; +} + +template<> +Tensor::TensorDataTypes +TensorT::dataType() +{ + return Tensor::TensorDataTypes::eDouble; +} + +} diff --git a/kompute/src/include/CMakeLists.txt b/kompute/src/include/CMakeLists.txt new file mode 100644 index 000000000..05e1ed5e1 --- /dev/null +++ b/kompute/src/include/CMakeLists.txt @@ -0,0 +1,46 @@ +cmake_minimum_required(VERSION 3.20) + +# 
#################################################### +# Kompute +# #################################################### +target_include_directories(kompute PUBLIC $ + $) + +target_sources(kompute PRIVATE + + # Header files (useful in IDEs) + kompute/Algorithm.hpp + kompute/Core.hpp + kompute/Kompute.hpp + kompute/Manager.hpp + kompute/Sequence.hpp + kompute/Tensor.hpp + + kompute/operations/OpAlgoDispatch.hpp + kompute/operations/OpBase.hpp + kompute/operations/OpMemoryBarrier.hpp + kompute/operations/OpMult.hpp + kompute/operations/OpTensorCopy.hpp + kompute/operations/OpTensorSyncDevice.hpp + kompute/operations/OpTensorSyncLocal.hpp + kompute/operations/OpBufferSyncDevice.hpp + kompute/operations/OpBufferSyncLocal.hpp + + kompute/logger/Logger.hpp +) + +install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +# #################################################### +# Logger +# #################################################### +target_include_directories(kp_logger PUBLIC $ + $) + +target_sources(kp_logger PRIVATE + + # Header files (useful in IDEs) + kompute/logger/Logger.hpp +) + +install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) \ No newline at end of file diff --git a/kompute/src/include/kompute/Algorithm.hpp b/kompute/src/include/kompute/Algorithm.hpp new file mode 100644 index 000000000..90fe48fef --- /dev/null +++ b/kompute/src/include/kompute/Algorithm.hpp @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#pragma once + +#include "kompute/Core.hpp" + +#include "fmt/format.h" +#include "kompute/Tensor.hpp" +#include "logger/Logger.hpp" + +namespace kp { + +/** + Abstraction for compute shaders that are run on top of tensors grouped via + ParameterGroups (which group descriptorsets) +*/ +class Algorithm +{ + public: + /** + * Main constructor for algorithm with configuration parameters to create + * the underlying resources. + * + * @param device The Vulkan device to use for creating resources + * @param tensors (optional) The tensors to use to create the descriptor + * resources + * @param spirv (optional) The spirv code to use to create the algorithm + * @param workgroup (optional) The kp::Workgroup to use for the dispatch + * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set. + * @param specializationConstants (optional) The templatable param is to be + * used to initialize the specialization constants which cannot be changed + * once set. + * @param pushConstants (optional) This templatable param is to be used + * when initializing the pipeline, which set the size of the push constants + * - these can be modified but all new values must have the same data type + * and length as otherwise it will result in errors. + */ + template + Algorithm(std::shared_ptr device, + vk::DescriptorPool *pool, + const std::vector>& tensors = {}, + const std::vector& spirv = {}, + const Workgroup& workgroup = {}, + const std::vector& specializationConstants = {}, + const std::vector

& pushConstants = {}) + { + KP_LOG_DEBUG("Kompute Algorithm Constructor with device"); + + this->mDevice = device; + this->mDescriptorPool = pool; + + if (tensors.size() && spirv.size()) { + KP_LOG_INFO( + "Kompute Algorithm initialising with tensor size: {} and " + "spirv size: {}", + tensors.size(), + spirv.size()); + this->rebuild(tensors, + spirv, + workgroup, + specializationConstants, + pushConstants); + } else { + KP_LOG_INFO( + "Kompute Algorithm constructor with empty tensors and or " + "spirv so not rebuilding vulkan components"); + } + } + + /** + * Rebuild function to reconstruct algorithm with configuration parameters + * to create the underlying resources. + * + * @param tensors The tensors to use to create the descriptor resources + * @param spirv The spirv code to use to create the algorithm + * @param workgroup (optional) The kp::Workgroup to use for the dispatch + * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set. + * @param specializationConstants (optional) The std::vector to use + * to initialize the specialization constants which cannot be changed once + * set. + * @param pushConstants (optional) The std::vector to use when + * initializing the pipeline, which set the size of the push constants - + * these can be modified but all new values must have the same vector size + * as this initial value. + */ + template + void rebuild(const std::vector>& tensors, + const std::vector& spirv, + const Workgroup& workgroup = {}, + const std::vector& specializationConstants = {}, + const std::vector

& pushConstants = {}) + { + KP_LOG_DEBUG("Kompute Algorithm rebuild started"); + + this->mTensors = tensors; + this->mSpirv = spirv; + + if (specializationConstants.size()) { + if (this->mSpecializationConstantsData) { + free(this->mSpecializationConstantsData); + } + uint32_t memorySize = + sizeof(decltype(specializationConstants.back())); + uint32_t size = specializationConstants.size(); + uint32_t totalSize = size * memorySize; + this->mSpecializationConstantsData = malloc(totalSize); + memcpy(this->mSpecializationConstantsData, + specializationConstants.data(), + totalSize); + this->mSpecializationConstantsDataTypeMemorySize = memorySize; + this->mSpecializationConstantsSize = size; + } + + if (pushConstants.size()) { + if (this->mPushConstantsData) { + free(this->mPushConstantsData); + } + uint32_t memorySize = sizeof(decltype(pushConstants.back())); + uint32_t size = pushConstants.size(); + uint32_t totalSize = size * memorySize; + this->mPushConstantsData = malloc(totalSize); + memcpy(this->mPushConstantsData, pushConstants.data(), totalSize); + this->mPushConstantsDataTypeMemorySize = memorySize; + this->mPushConstantsSize = size; + } + + this->setWorkgroup( + workgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1); + + // Descriptor pool is created first so if available then destroy all + // before rebuild + if (this->isInit()) { + this->destroy(); + } + + this->createParameters(); + this->createShaderModule(); + this->createPipeline(); + } + + /** + * Destructor for Algorithm which is responsible for freeing and desroying + * respective pipelines and owned parameter groups. + */ + ~Algorithm(); + + /** + * Records the dispatch function with the provided template parameters or + * alternatively using the size of the tensor by default. + * + * @param commandBuffer Command buffer to record the algorithm resources to + */ + void recordDispatch(const vk::CommandBuffer& commandBuffer); + + /** + * Records command that binds the "core" algorithm components which consist + * of binding the pipeline and binding the descriptorsets. + * + * @param commandBuffer Command buffer to record the algorithm resources to + */ + void recordBindCore(const vk::CommandBuffer& commandBuffer); + + /** + * Records command that binds the push constants to the command buffer + * provided + * - it is required that the pushConstants provided are of the same size as + * the ones provided during initialization. + * + * @param commandBuffer Command buffer to record the algorithm resources to + */ + void recordBindPush(const vk::CommandBuffer& commandBuffer); + + /** + * function that checks all the gpu resource components to verify if these + * have been created and returns true if all are valid. + * + * @returns returns true if the algorithm is currently initialized. + */ + bool isInit(); + + /** + * Sets the work group to use in the recordDispatch + * + * @param workgroup The kp::Workgroup value to use to update the algorithm. + * It must have a value greater than 1 on the x value (index 1) otherwise it + * will be initialized on the size of the first tensor (ie. + * this->mTensor[0]->size()) + */ + void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1); + /** + * Sets the push constants to the new value provided to use in the next + * bindPush() + * + * @param pushConstants The templatable vector is to be used to set the push + * constants to use in the next bindPush(...) calls. The constants provided + * must be of the same size as the ones created during initialization. 
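+     *
+     * Illustrative call pattern, where `algo` is assumed to be an algorithm
+     * built with two float push constants and `sq` an existing kp::Sequence:
+     * @code
+     * algo->setPushConstants<float>({ 2.0f, 0.5f });
+     * sq->record<kp::OpAlgoDispatch>(algo)->eval();
+     * @endcode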
+ */ + template + void setPushConstants(const std::vector& pushConstants) + { + uint32_t memorySize = sizeof(decltype(pushConstants.back())); + uint32_t size = pushConstants.size(); + this->setPushConstants(pushConstants.data(), size, memorySize); + } + + void updateDescriptors(vk::DescriptorPool *pool) + { + this->mDescriptorPool = pool; + this->setWorkgroup( + this->mWorkgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1); + + this->updateParameters(); // TODO: See if we can reduce this + } + + /** + * Sets the push constants to the new value provided to use in the next + * bindPush() with the raw memory block location and memory size to be used. + * + * @param data The raw data point to copy the data from, without modifying + * the pointer. + * @param size The number of data elements provided in the data + * @param memorySize The memory size of each of the data elements in bytes. + */ + void setPushConstants(const void* data, uint32_t size, uint32_t memorySize) + { + + uint32_t totalSize = memorySize * size; + uint32_t previousTotalSize = + this->mPushConstantsDataTypeMemorySize * this->mPushConstantsSize; + + if (totalSize != previousTotalSize) { + throw std::runtime_error(fmt::format( + "Kompute Algorithm push " + "constant total memory size provided is {} but expected {} bytes", + totalSize, + previousTotalSize)); + } + if (this->mPushConstantsData) { + free(this->mPushConstantsData); + } + + this->mPushConstantsData = malloc(totalSize); + memcpy(this->mPushConstantsData, data, totalSize); + this->mPushConstantsDataTypeMemorySize = memorySize; + this->mPushConstantsSize = size; + } + + /** + * Gets the current workgroup from the algorithm. + * + * @param The kp::Constant to use to set the push constants to use in the + * next bindPush(...) calls. The constants provided must be of the same size + * as the ones created during initialization. + */ + const Workgroup& getWorkgroup(); + /** + * Gets the specialization constants of the current algorithm. + * + * @returns The std::vector currently set for specialization + * constants + */ + template + const std::vector getSpecializationConstants() + { + return { (T*)this->mSpecializationConstantsData, + ((T*)this->mSpecializationConstantsData) + + this->mSpecializationConstantsSize }; + } + /** + * Gets the specialization constants of the current algorithm. + * + * @returns The std::vector currently set for push constants + */ + template + const std::vector getPushConstants() + { + return { (T*)this->mPushConstantsData, + ((T*)this->mPushConstantsData) + this->mPushConstantsSize }; + } + /** + * Gets the current tensors that are used in the algorithm. + * + * @returns The list of tensors used in the algorithm. 
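+     *
+     * Illustrative sketch, assuming `algo` is an initialised algorithm:
+     * @code
+     * for (const std::shared_ptr<kp::Tensor>& t : algo->getTensors()) {
+     *     KP_LOG_INFO("Bound tensor holds {} elements", t->size());
+     * }
+     * @endcode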
+ */ + const std::vector>& getTensors(); + void setTensors(const std::vector>& tensors); + + void destroy(); + + private: + // -------------- NEVER OWNED RESOURCES + std::shared_ptr mDevice; + std::vector> mTensors; + + // -------------- OPTIONALLY OWNED RESOURCES + std::shared_ptr mDescriptorSetLayout; + bool mFreeDescriptorSetLayout = false; + vk::DescriptorPool *mDescriptorPool = nullptr; + std::shared_ptr mDescriptorSet; + bool mFreeDescriptorSet = false; + std::shared_ptr mShaderModule; + bool mFreeShaderModule = false; + std::shared_ptr mPipelineLayout; + bool mFreePipelineLayout = false; + std::shared_ptr mPipelineCache; + bool mFreePipelineCache = false; + std::shared_ptr mPipeline; + bool mFreePipeline = false; + + // -------------- ALWAYS OWNED RESOURCES + std::vector mSpirv; + void* mSpecializationConstantsData = nullptr; + uint32_t mSpecializationConstantsDataTypeMemorySize = 0; + uint32_t mSpecializationConstantsSize = 0; + void* mPushConstantsData = nullptr; + uint32_t mPushConstantsDataTypeMemorySize = 0; + uint32_t mPushConstantsSize = 0; + Workgroup mWorkgroup; + + // Create util functions + void createShaderModule(); + void createPipeline(); + + // Parameters + void freeParameters(); + void createParameters(); + void updateParameters(); +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/Core.hpp b/kompute/src/include/kompute/Core.hpp new file mode 100644 index 000000000..99222cbde --- /dev/null +++ b/kompute/src/include/kompute/Core.hpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */
+
+#pragma once
+
+#include <vulkan/vulkan.hpp>
+
+// Typedefs to simplify interaction with core types
+namespace kp {
+typedef std::array<uint32_t, 3> Workgroup;
+typedef std::vector<float> Constants;
+}
+
+// Must be after vulkan is included
+#ifndef KOMPUTE_VK_API_VERSION
+#ifndef KOMPUTE_VK_API_MAJOR_VERSION
+#define KOMPUTE_VK_API_MAJOR_VERSION 1
+#endif // KOMPUTE_VK_API_MAJOR_VERSION
+#ifndef KOMPUTE_VK_API_MINOR_VERSION
+#define KOMPUTE_VK_API_MINOR_VERSION 2
+#endif // KOMPUTE_VK_API_MINOR_VERSION
+#define KOMPUTE_VK_API_VERSION                                                 \
+    VK_MAKE_VERSION(                                                           \
+      KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0)
+#endif // KOMPUTE_VK_API_VERSION
+
+#if defined(KOMPUTE_BUILD_PYTHON)
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+// from python/src/main.cpp
+extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error;
+#endif
diff --git a/kompute/src/include/kompute/Kompute.hpp b/kompute/src/include/kompute/Kompute.hpp
new file mode 100644
index 000000000..f59a63b50
--- /dev/null
+++ b/kompute/src/include/kompute/Kompute.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "Algorithm.hpp"
+#include "Core.hpp"
+#include "Manager.hpp"
+#include "Sequence.hpp"
+#include "Tensor.hpp"
+
+#include "operations/OpAlgoDispatch.hpp"
+#include "operations/OpBase.hpp"
+#include "operations/OpMemoryBarrier.hpp"
+#include "operations/OpMult.hpp"
+#include "operations/OpTensorCopy.hpp"
+#include "operations/OpTensorSyncDevice.hpp"
+#include "operations/OpTensorSyncLocal.hpp"
+#include "operations/OpBufferSyncDevice.hpp"
+#include "operations/OpBufferSyncLocal.hpp"
+
+// Will be built by CMake and placed inside the build directory
+#include "ShaderLogisticRegression.hpp"
+#include "ShaderOpMult.hpp"
diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp
new file mode 100644
index 000000000..8fda58f84
--- /dev/null
+++ b/kompute/src/include/kompute/Manager.hpp
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Copyright (c) 2023 Nomic, Inc. All rights reserved.
+ *
+ * This software is licensed under the terms of the Software for Open Models License (SOM),
+ * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
+ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
+ */
+
+#pragma once
+
+#include <set>
+#include <unordered_map>
+
+#include "kompute/Core.hpp"
+
+#include "kompute/Sequence.hpp"
+#include "logger/Logger.hpp"
+
+#define KP_DEFAULT_SESSION "DEFAULT"
+
+namespace kp {
+
+/**
+    Base orchestrator which creates and manages device and child components
+*/
+class Manager
+{
+  public:
+    /**
+        Base constructor.
+    */
+    Manager();
+
+    /**
+     * Manager destructor which would ensure all owned resources are destroyed
+     * unless explicitly stated that resources should not be destroyed or freed.
+     */
+    ~Manager();
+
+    bool hasDevice() const {
+        return this->mDevice.get();
+    }
+
+    /**
+     * Initialize a device.
+     *
+     * @param physicalDeviceIndex The index of the physical device to use
+     * @param familyQueueIndices (Optional) List of queue indices to add for
+     * explicit allocation
+     * @param desiredExtensions The desired extensions to load from
+     * physicalDevice
+     */
+    void initializeDevice(uint32_t physicalDeviceIndex,
+                          const std::vector<uint32_t>& familyQueueIndices = {},
+                          const std::vector<std::string>& desiredExtensions = {});
+
+    /**
+     * Create a managed sequence that will be destroyed by this manager
+     * if it hasn't been destroyed by its reference count going to zero.
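+     *
+     * Illustrative sketch of the expected call pattern (device index 0 is an
+     * arbitrary example value):
+     * @code
+     * kp::Manager mgr;
+     * mgr.initializeDevice(0);
+     * std::shared_ptr<kp::Sequence> sq = mgr.sequence();
+     * @endcode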
+ * + * @param queueIndex The queue to use from the available queues + * @param nrOfTimestamps The maximum number of timestamps to allocate. + * If zero (default), disables latching of timestamps. + * @returns Shared pointer with initialised sequence + */ + std::shared_ptr sequence(uint32_t queueIndex = 0, + uint32_t totalTimestamps = 0); + + /** + * Create a managed tensor that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. + * + * @param data The data to initialize the tensor with + * @param tensorType The type of tensor to initialize + * @returns Shared pointer with initialised tensor + */ + template + std::shared_ptr> tensorT( + const std::vector& data, + vk::DeviceMemory *primaryMemory, + vk::Buffer *primaryBuffer, + vk::DeviceMemory *stagingMemory, + vk::Buffer *stagingBuffer, + Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice) + { + KP_LOG_DEBUG("Kompute Manager tensor creation triggered"); + + std::shared_ptr> tensor{ new kp::TensorT( + this->mPhysicalDevice, this->mDevice, data, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, tensorType) }; + + if (this->mManageResources) { + this->mManagedTensors.push_back(tensor); + } + + return tensor; + } + + std::shared_ptr tensor( + void* data, + uint32_t elementTotalCount, + uint64_t memorySize, + const Tensor::TensorDataTypes& dataType, + vk::DeviceMemory *primaryMemory, + vk::Buffer *primaryBuffer, + vk::DeviceMemory *stagingMemory, + vk::Buffer *stagingBuffer, + vk::DeviceSize offset, + Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice) + { + std::shared_ptr tensor{ new kp::Tensor(this->mPhysicalDevice, + this->mDevice, + data, + elementTotalCount, + memorySize, + dataType, + primaryMemory, + primaryBuffer, + stagingMemory, + stagingBuffer, + offset, + tensorType) }; + + if (this->mManageResources) { + this->mManagedTensors.push_back(tensor); + } + + return tensor; + } + + /** + * Default non-template function that can be used to create algorithm + * objects which provides default types to the push and spec constants as + * floats. + * + * @param tensors (optional) The tensors to initialise the algorithm with + * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch + * @param workgroup (optional) kp::Workgroup for algorithm to use, and + * defaults to (tensor[0].size(), 1, 1) + * @param specializationConstants (optional) float vector to use for + * specialization constants, and defaults to an empty constant + * @param pushConstants (optional) float vector to use for push constants, + * and defaults to an empty constant + * @returns Shared pointer with initialised algorithm + */ + std::shared_ptr algorithm( + vk::DescriptorPool *pool, + const std::vector>& tensors = {}, + const std::vector& spirv = {}, + const Workgroup& workgroup = {}, + const std::vector& specializationConstants = {}, + const std::vector& pushConstants = {}) + { + return this->algorithm<>( + pool, tensors, spirv, workgroup, specializationConstants, pushConstants); + } + + /** + * Create a managed algorithm that will be destroyed by this manager + * if it hasn't been destroyed by its reference count going to zero. 
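+     *
+     * Illustrative sketch, assuming `pool`, `tensors` and the SPIR-V bytes in
+     * `spirv` are provided by the caller (as the ggml Vulkan backend does),
+     * with workgroup and constants left at their defaults:
+     * @code
+     * auto algo = mgr.algorithm(pool, tensors, spirv);
+     * mgr.sequence()->record<kp::OpAlgoDispatch>(algo)->eval();
+     * @endcode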
+ * + * @param tensors (optional) The tensors to initialise the algorithm with + * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch + * @param workgroup (optional) kp::Workgroup for algorithm to use, and + * defaults to (tensor[0].size(), 1, 1) + * @param specializationConstants (optional) templatable vector parameter to + * use for specialization constants, and defaults to an empty constant + * @param pushConstants (optional) templatable vector parameter to use for + * push constants, and defaults to an empty constant + * @returns Shared pointer with initialised algorithm + */ + template + std::shared_ptr algorithm( + vk::DescriptorPool *pool, + const std::vector>& tensors, + const std::vector& spirv, + const Workgroup& workgroup, + const std::vector& specializationConstants, + const std::vector

& pushConstants) + { + + KP_LOG_DEBUG("Kompute Manager algorithm creation triggered"); + + std::shared_ptr algorithm{ new kp::Algorithm( + this->mDevice, + pool, + tensors, + spirv, + workgroup, + specializationConstants, + pushConstants) }; + + if (this->mManageResources) { + this->mManagedAlgorithms.push_back(algorithm); + } + + return algorithm; + } + + /** + * Destroy the GPU resources and all managed resources by manager. + **/ + void destroy(); + /** + * Run a pseudo-garbage collection to release all the managed resources + * that have been already freed due to these reaching to zero ref count. + **/ + void clear(); + + /** + * Information about the current device. + * + * @return vk::PhysicalDeviceProperties containing information about the + *device + **/ + vk::PhysicalDeviceProperties getDeviceProperties() const; + + /** + * List the devices available in the current vulkan instance. + * + * @return vector of physical devices containing their respective properties + **/ + std::vector listDevices() const; + + /** + * The current Vulkan instance. + * + * @return a shared pointer to the current Vulkan instance held by this + *object + **/ + std::shared_ptr getVkInstance() const; + + std::shared_ptr device() const { return mDevice; } + std::shared_ptr physicalDevice() const { return mPhysicalDevice; } + + private: + // -------------- OPTIONALLY OWNED RESOURCES + std::shared_ptr mInstance = nullptr; + bool mFreeInstance = false; + std::shared_ptr mPhysicalDevice = nullptr; + std::shared_ptr mDevice = nullptr; + bool mFreeDevice = false; + + // -------------- ALWAYS OWNED RESOURCES + std::vector> mManagedTensors; + std::vector> mManagedSequences; + std::vector> mManagedAlgorithms; + + std::vector mComputeQueueFamilyIndices; + std::vector> mComputeQueues; + + bool mManageResources = false; + +#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS + vk::DebugReportCallbackEXT mDebugReportCallback; + vk::DispatchLoaderDynamic mDebugDispatcher; +#endif + + // Create functions + void createInstance(); + void createDevice(const std::vector& familyQueueIndices = {}, + uint32_t physicalDeviceIndex = 0, + const std::vector& desiredExtensions = {}); +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/Sequence.hpp b/kompute/src/include/kompute/Sequence.hpp new file mode 100644 index 000000000..e282242f1 --- /dev/null +++ b/kompute/src/include/kompute/Sequence.hpp @@ -0,0 +1,313 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#pragma once + +#include "kompute/Core.hpp" + +#include "kompute/operations/OpAlgoDispatch.hpp" +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Container of operations that can be sent to GPU as batch + */ +class Sequence : public std::enable_shared_from_this +{ + public: + /** + * Main constructor for sequence which requires core vulkan components to + * generate all dependent resources. 
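+     *
+     * Sequences are normally obtained through kp::Manager::sequence() rather
+     * than constructed directly. Illustrative sketch, assuming `mgr`, `algo`
+     * and the tensors already exist:
+     * @code
+     * mgr.sequence()
+     *    ->record<kp::OpTensorSyncDevice>({ tensorA, tensorB })
+     *    ->record<kp::OpAlgoDispatch>(algo)
+     *    ->record<kp::OpTensorSyncLocal>({ tensorA, tensorB })
+     *    ->eval();
+     * @endcode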
+ * + * @param physicalDevice Vulkan physical device + * @param device Vulkan logical device + * @param computeQueue Vulkan compute queue + * @param queueIndex Vulkan compute queue index in device + * @param totalTimestamps Maximum number of timestamps to allocate + */ + Sequence(std::shared_ptr physicalDevice, + std::shared_ptr device, + std::shared_ptr computeQueue, + uint32_t queueIndex, + uint32_t totalTimestamps = 0); + /** + * Destructor for sequence which is responsible for cleaning all subsequent + * owned operations. + */ + ~Sequence(); + + /** + * Record function for operation to be added to the GPU queue in batch. This + * template requires classes to be derived from the OpBase class. This + * function also requires the Sequence to be recording, otherwise it will + * not be able to add the operation. + * + * @param op Object derived from kp::BaseOp that will be recoreded by the + * sequence which will be used when the operation is evaluated. + * @return shared_ptr of the Sequence class itself + */ + std::shared_ptr record(std::shared_ptr op); + + /** + * Record function for operation to be added to the GPU queue in batch. This + * template requires classes to be derived from the OpBase class. This + * function also requires the Sequence to be recording, otherwise it will + * not be able to add the operation. + * + * @param tensors Vector of tensors to use for the operation + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr record( + std::vector> tensors, + TArgs&&... params) + { + std::shared_ptr op{ new T(tensors, std::forward(params)...) }; + return this->record(op); + } + /** + * Record function for operation to be added to the GPU queue in batch. This + * template requires classes to be derived from the OpBase class. This + * function also requires the Sequence to be recording, otherwise it will + * not be able to add the operation. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr record(std::shared_ptr algorithm, + TArgs&&... params) + { + std::shared_ptr op{ new T(algorithm, + std::forward(params)...) }; + return this->record(op); + } + + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job synchronously (with a barrier). + * + * @return shared_ptr of the Sequence class itself + */ + std::shared_ptr eval(); + + /** + * Resets all the recorded and stored operations, records the operation + * provided and submits into the gpu as a submit job synchronously (with a + * barrier). + * + * @return shared_ptr of the Sequence class itself + */ + std::shared_ptr eval(std::shared_ptr op); + + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. + * + * @param tensors Vector of tensors to use for the operation + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr eval(std::vector> tensors, + TArgs&&... 
params) + { + std::shared_ptr op{ new T(tensors, std::forward(params)...) }; + return this->eval(op); + } + + template + std::shared_ptr eval(vk::Buffer *primaryBuffer, + vk::Buffer *stagingBuffer, + vk::DeviceSize size, + TArgs&&... params) + { + std::shared_ptr op{ new T(primaryBuffer, stagingBuffer, size, std::forward(params)...) }; + return this->eval(op); + } + + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr eval(std::shared_ptr algorithm, + TArgs&&... params) + { + std::shared_ptr op{ new T(algorithm, + std::forward(params)...) }; + return this->eval(op); + } + + /** + * Eval Async sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job without a barrier. EvalAwait() + * must ALWAYS be called after to ensure the sequence is terminated + * correctly. + * + * @return Boolean stating whether execution was successful. + */ + std::shared_ptr evalAsync(); + /** + * Clears currnet operations to record provided one in the vector of + * operations into the gpu as a submit job without a barrier. EvalAwait() + * must ALWAYS be called after to ensure the sequence is terminated + * correctly. + * + * @return Boolean stating whether execution was successful. + */ + std::shared_ptr evalAsync(std::shared_ptr op); + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. + * + * @param tensors Vector of tensors to use for the operation + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr evalAsync( + std::vector> tensors, + TArgs&&... params) + { + std::shared_ptr op{ new T(tensors, std::forward(params)...) }; + return this->evalAsync(op); + } + /** + * Eval sends all the recorded and stored operations in the vector of + * operations into the gpu as a submit job with a barrier. + * + * @param algorithm Algorithm to use for the record often used for OpAlgo + * operations + * @param TArgs Template parameters that are used to initialise operation + * which allows for extensible configurations on initialisation. + * @return shared_ptr of the Sequence class itself + */ + template + std::shared_ptr evalAsync(std::shared_ptr algorithm, + TArgs&&... params) + { + std::shared_ptr op{ new T(algorithm, + std::forward(params)...) }; + return this->evalAsync(op); + } + + /** + * Eval Await waits for the fence to finish processing and then once it + * finishes, it runs the postEval of all operations. + * + * @param waitFor Number of milliseconds to wait before timing out. + * @return shared_ptr of the Sequence class itself + */ + std::shared_ptr evalAwait(uint64_t waitFor = UINT64_MAX); + + /** + * Clear function clears all operations currently recorded and starts + * recording again. + */ + void clear(); + + /** + * Return the timestamps that were latched at the beginning and + * after each operation during the last eval() call. 
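+     *
+     * Illustrative sketch, assuming `mgr` and `algo` already exist and the
+     * device supports timestamp queries:
+     * @code
+     * auto sq = mgr.sequence(0, 16); // reserve up to 16 timestamps
+     * sq->record<kp::OpAlgoDispatch>(algo)->eval();
+     * auto timestamps = sq->getTimestamps();
+     * @endcode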
+ */ + std::vector getTimestamps(); + + /** + * Begins recording commands for commands to be submitted into the command + * buffer. + */ + void begin(); + + /** + * Ends the recording and stops recording commands when the record command + * is sent. + */ + void end(); + + /** + * Returns true if the sequence is currently in recording activated. + * + * @return Boolean stating if recording ongoing. + */ + bool isRecording() const; + + /** + * Returns true if the sequence has been initialised, and it's based on the + * GPU resources being referenced. + * + * @return Boolean stating if is initialized + */ + bool isInit() const; + + /** + * Clears command buffer and triggers re-record of all the current + * operations saved, which is useful if the underlying kp::Tensors or + * kp::Algorithms are modified and need to be re-recorded. + */ + void rerecord(); + + /** + * Returns true if the sequence is currently running - mostly used for async + * workloads. + * + * @return Boolean stating if currently running. + */ + bool isRunning() const; + + /** + * Destroys and frees the GPU resources which include the buffer and memory + * and sets the sequence as init=False. + */ + void destroy(); + + private: + // -------------- NEVER OWNED RESOURCES + std::shared_ptr mPhysicalDevice = nullptr; + std::shared_ptr mDevice = nullptr; + std::shared_ptr mComputeQueue = nullptr; + uint32_t mQueueIndex = -1; + + // -------------- OPTIONALLY OWNED RESOURCES + std::shared_ptr mCommandPool = nullptr; + bool mFreeCommandPool = false; + std::shared_ptr mCommandBuffer = nullptr; + bool mFreeCommandBuffer = false; + + // -------------- ALWAYS OWNED RESOURCES + vk::Fence mFence; + std::vector> mOperations{}; + std::shared_ptr timestampQueryPool = nullptr; + + // State + bool mRecording = false; + bool mIsRunning = false; + + // Create functions + void createCommandPool(); + void createCommandBuffer(); + void createTimestampQueryPool(uint32_t totalTimestamps); +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/Tensor.hpp b/kompute/src/include/kompute/Tensor.hpp new file mode 100644 index 000000000..4c260ce6b --- /dev/null +++ b/kompute/src/include/kompute/Tensor.hpp @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ +#pragma once + +#include "kompute/Core.hpp" +#include "logger/Logger.hpp" +#include +#include + +namespace kp { + +/** + * Structured data used in GPU operations. + * + * Tensors are the base building block in Kompute to perform operations across + * GPUs. Each tensor would have a respective Vulkan memory and buffer, which + * would be used to store their respective data. The tensors can be used for GPU + * data storage or transfer. + */ +class Tensor +{ + public: + /** + * Type for tensors created: Device allows memory to be transferred from + * staging buffers. Staging are host memory visible. Storage are device + * visible but are not set up to transfer or receive data (only for shader + * storage). 
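+     *
+     * Illustrative sketch of creating an eDevice tensor through the manager;
+     * in this backend the vk::Buffer / vk::DeviceMemory handles, the data
+     * pointer and the sizes are all assumed to be allocated and owned by the
+     * caller:
+     * @code
+     * auto t = mgr.tensor(data, elementCount, memoryBytes,
+     *                     kp::Tensor::TensorDataTypes::eFloat,
+     *                     primaryMemory, primaryBuffer,
+     *                     stagingMemory, stagingBuffer,
+     *                     0, kp::Tensor::TensorTypes::eDevice);
+     * @endcode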
+ */ + enum class TensorTypes + { + eDevice = 0, ///< Type is device memory, source and destination + eHost = 1, ///< Type is host memory, source and destination + eStorage = 2, ///< Type is Device memory (only) + }; + enum class TensorDataTypes + { + eBool = 0, + eInt = 1, + eUnsignedInt = 2, + eFloat = 3, + eDouble = 4, + }; + + static std::string toString(TensorDataTypes dt); + static std::string toString(TensorTypes dt); + + /** + * Constructor with data provided which would be used to create the + * respective vulkan buffer and memory. + * + * @param physicalDevice The physical device to use to fetch properties + * @param device The device to use to create the buffer and memory from + * @param data Non-zero-sized vector of data that will be used by the + * tensor + * @param tensorTypes Type for the tensor which is of type TensorTypes + */ + Tensor(std::shared_ptr physicalDevice, + std::shared_ptr device, + void* data, + uint32_t elementTotalCount, + uint32_t memorySize, + const TensorDataTypes& dataType, + vk::DeviceMemory *primaryMemory, + vk::Buffer *primaryBuffer, + vk::DeviceMemory *stagingMemory, + vk::Buffer *stagingBuffer, + vk::DeviceSize offset, + const TensorTypes& tensorType = TensorTypes::eDevice); + + /** + * Destructor which is in charge of freeing vulkan resources unless they + * have been provided externally. + */ + virtual ~Tensor(); + + /** + * Function to trigger reinitialisation of the tensor buffer and memory with + * new data as well as new potential device type. + * + * @param data Vector of data to use to initialise vector from + * @param tensorType The type to use for the tensor + */ + void rebuild(void* data, + uint32_t elementTotalCount, + uint64_t memorySize, + vk::DeviceMemory *primaryMemory, + vk::Buffer *primaryBuffer, + vk::DeviceMemory *stagingMemory, + vk::Buffer *stagingBuffer, + vk::DeviceSize offset); + + /** + * Destroys and frees the GPU resources which include the buffer and memory. + */ + void destroy(); + + /** + * Check whether tensor is initialized based on the created gpu resources. + * + * @returns Boolean stating whether tensor is initialized + */ + bool isInit(); + + /** + * Retrieve the tensor type of the Tensor + * + * @return Tensor type of tensor + */ + TensorTypes tensorType(); + + /** + * Records a copy from the memory of the tensor provided to the current + * thensor. This is intended to pass memory into a processing, to perform + * a staging buffer transfer, or to gather output (between others). + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param copyFromTensor Tensor to copy the data from + */ + void recordCopyFrom(const vk::CommandBuffer& commandBuffer, + std::shared_ptr copyFromTensor); + + /** + * Records a copy from the internal staging memory to the device memory + * using an optional barrier to wait for the operation. This function would + * only be relevant for kp::Tensors of type eDevice. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + */ + void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer); + + /** + * Records a copy from the internal device memory to the staging memory + * using an optional barrier to wait for the operation. This function would + * only be relevant for kp::Tensors of type eDevice. 
+ * + * @param commandBuffer Vulkan Command Buffer to record the commands into + */ + void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer); + + /** + * Records the buffer memory barrier into the primary buffer and command + * buffer which ensures that relevant data transfers are carried out + * correctly. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param srcAccessMask Access flags for source access mask + * @param dstAccessMask Access flags for destination access mask + * @param scrStageMask Pipeline stage flags for source stage mask + * @param dstStageMask Pipeline stage flags for destination stage mask + */ + void recordPrimaryBufferMemoryBarrier( + const vk::CommandBuffer& commandBuffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask); + /** + * Records the buffer memory barrier into the staging buffer and command + * buffer which ensures that relevant data transfers are carried out + * correctly. + * + * @param commandBuffer Vulkan Command Buffer to record the commands into + * @param srcAccessMask Access flags for source access mask + * @param dstAccessMask Access flags for destination access mask + * @param scrStageMask Pipeline stage flags for source stage mask + * @param dstStageMask Pipeline stage flags for destination stage mask + */ + void recordStagingBufferMemoryBarrier( + const vk::CommandBuffer& commandBuffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask); + + /** + * Constructs a vulkan descriptor buffer info which can be used to specify + * and reference the underlying buffer component of the tensor without + * exposing it. + * + * @return Descriptor buffer info with own buffer + */ + vk::DescriptorBufferInfo constructDescriptorBufferInfo(); + + /** + * Returns the size/magnitude of the Tensor, which will be the total number + * of elements across all dimensions + * + * @return Unsigned integer representing the total number of elements + */ + uint32_t size(); + + /** + * Returns the total memory size of the data contained by the Tensor object + * + * @return Unsigned integer representing the memory of the tensor in bytes. + */ + uint64_t memorySize(); + + /** + * Retrieve the data type of the tensor (host, device, storage) + * + * @return Data type of tensor of type kp::Tensor::TensorDataTypes + */ + TensorDataTypes dataType(); + + /** + * Retrieve the raw data via the pointer to the memory that contains the raw + * memory of this current tensor. This tensor gets changed to a nullptr when + * the Tensor is removed. + * + * @return Pointer to raw memory containing raw bytes data of Tensor. + */ + void* rawData(); + + /** + * Sets / resets the data of the tensor which is directly done on the GPU + * host visible memory available by the tensor. + */ + void setRawData(const void* data); + + /** + * Template to return the pointer data converted by specific type, which + * would be any of the supported types including float, double, int32, + * uint32 and bool. + * + * @return Pointer to raw memory containing raw bytes data of Tensor. + */ + template + T* data() + { + return (T*)this->mRawData; + } + + /** + * Template to get the data of the current tensor as a vector of specific + * type, which would be any of the supported types including float, double, + * int32, uint32 and bool. 
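+     *
+     * Illustrative sketch, assuming the tensor holds float data that has been
+     * synced back to host visible memory (for example via OpTensorSyncLocal):
+     * @code
+     * std::vector<float> out = tensor->vector<float>();
+     * float first = tensor->data<float>()[0];
+     * @endcode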
+ * + * @return Vector of type provided by template. + */ + template + std::vector vector() + { + return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() }; + } + + protected: + // -------------- ALWAYS OWNED RESOURCES + TensorTypes mTensorType; + TensorDataTypes mDataType; + uint32_t mSize = 0; + uint64_t mMemorySize = 0; + vk::DeviceSize mOffset = 0; + void* mRawData = nullptr; + + private: + // -------------- NEVER OWNED RESOURCES + std::shared_ptr mPhysicalDevice; + std::shared_ptr mDevice; + vk::Buffer *mPrimaryBuffer = nullptr; + vk::Buffer *mStagingBuffer = nullptr; + vk::DeviceMemory *mPrimaryMemory = nullptr; + vk::DeviceMemory *mStagingMemory = nullptr; + + void setGPUResources(vk::DeviceMemory *primaryMemory, + vk::Buffer *primaryBuffer, + vk::DeviceMemory *stagingMemory, + vk::Buffer *stagingBuffer, + vk::DeviceSize offset); + void recordCopyBuffer(const vk::CommandBuffer& commandBuffer, + vk::Buffer *bufferFrom, + vk::Buffer *bufferTo, + vk::DeviceSize bufferSize, + vk::BufferCopy copyRegion); + void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, + const vk::Buffer& buffer, + vk::AccessFlagBits srcAccessMask, + vk::AccessFlagBits dstAccessMask, + vk::PipelineStageFlagBits srcStageMask, + vk::PipelineStageFlagBits dstStageMask); + + // Private util functions + vk::BufferUsageFlags getPrimaryBufferUsageFlags(); + vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags(); + vk::BufferUsageFlags getStagingBufferUsageFlags(); + vk::MemoryPropertyFlags getStagingMemoryPropertyFlags(); +}; + +template +class TensorT : public Tensor +{ + + public: + ~TensorT() { KP_LOG_DEBUG("Kompute TensorT destructor"); } + + TensorDataTypes dataType(); +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/logger/Logger.hpp b/kompute/src/include/kompute/logger/Logger.hpp new file mode 100644 index 000000000..f97e95cf0 --- /dev/null +++ b/kompute/src/include/kompute/logger/Logger.hpp @@ -0,0 +1,197 @@ +#pragma once + +#define KOMPUTE_LOG_LEVEL_TRACE 0 +#define KOMPUTE_LOG_LEVEL_DEBUG 1 +#define KOMPUTE_LOG_LEVEL_INFO 2 +#define KOMPUTE_LOG_LEVEL_WARN 3 +#define KOMPUTE_LOG_LEVEL_ERROR 4 +#define KOMPUTE_LOG_LEVEL_CRITICAL 5 +#define KOMPUTE_LOG_LEVEL_OFF 6 + +// Logging is disabled entirely. +#if KOMPUTE_OPT_LOG_LEVEL_DISABLED +#define KP_LOG_TRACE(...) +#define KP_LOG_DEBUG(...) +#define KP_LOG_INFO(...) +#define KP_LOG_WARN(...) +#define KP_LOG_ERROR(...) +#else + +#if !KOMPUTE_OPT_USE_SPDLOG +#if VK_USE_PLATFORM_ANDROID_KHR +#include +#include +static const char* KOMPUTE_LOG_TAG = "KomputeLog"; +#else +#if KOMPUTE_BUILD_PYTHON +#include +namespace py = pybind11; +// from python/src/main.cpp +extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error; +#else +#include +#endif // KOMPUTE_BUILD_PYTHON +#endif // VK_USE_PLATFORM_ANDROID_KHR +#else +#include +#endif // !KOMPUTE_OPT_USE_SPDLOG +#include +#include +#include +namespace logger { +// Setup the logger, note the loglevel can not be set below the CMake log level +// (To change this use -DKOMPUTE_OPT_LOG_LEVEL=...) +void +setupLogger(); + +// Logging is enabled, but we do not use Spdlog. So we use fmt in case nothing +// else is defined, overriding logging. +#if !KOMPUTE_OPT_USE_SPDLOG + +#ifndef KP_LOG_TRACE +#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_TRACE +#if VK_USE_PLATFORM_ANDROID_KHR +#define KP_LOG_TRACE(...) \ + ((void)__android_log_write( \ + ANDROID_LOG_VERBOSE, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str())) +#else +#if KOMPUTE_BUILD_PYTHON +#define KP_LOG_DEBUG(...) 
kp_trace(fmt::format(__VA_ARGS__)) +#else +#define KP_LOG_TRACE(...) \ + fmt::print("[{} {}] [trace] [{}:{}] {}\n", \ + __DATE__, \ + __TIME__, \ + __FILE__, \ + __LINE__, \ + fmt::format(__VA_ARGS__)) +#endif // KOMPUTE_BUILD_PYTHON +#endif // VK_USE_PLATFORM_ANDROID_KHR +#else +#define KP_LOG_TRACE(...) +#endif +#endif // !KP_LOG_TRACE + +#ifndef KP_LOG_DEBUG +#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG +#if VK_USE_PLATFORM_ANDROID_KHR +#define KP_LOG_DEBUG(...) \ + ((void)__android_log_write( \ + ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str())) +#else +#if KOMPUTE_BUILD_PYTHON +#define KP_LOG_DEBUG(...) kp_debug(fmt::format(__VA_ARGS__)) +#else +#ifdef __FILE_NAME__ // gcc 12 provides only file name without path +#define KP_LOG_DEBUG(...) \ + fmt::print("[{} {}] [debug] [{}:{}] {}\n", \ + __DATE__, \ + __TIME__, \ + __FILE_NAME__, \ + __LINE__, \ + fmt::format(__VA_ARGS__)) +#else +#define KP_LOG_DEBUG(...) \ + fmt::print("[{} {}] [debug] [{}:{}] {}\n", \ + __DATE__, \ + __TIME__, \ + __FILE__, \ + __LINE__, \ + fmt::format(__VA_ARGS__)) +#endif // __FILE__NAME__ +#endif // KOMPUTE_BUILD_PYTHON +#endif // VK_USE_PLATFORM_ANDROID_KHR +#else +#define KP_LOG_DEBUG(...) +#endif +#endif // !KP_LOG_DEBUG + +#ifndef KP_LOG_INFO +#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO +#if VK_USE_PLATFORM_ANDROID_KHR +#define KP_LOG_INFO(...) \ + ((void)__android_log_write( \ + ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str())) +#else +#if KOMPUTE_BUILD_PYTHON +#define KP_LOG_DEBUG(...) kp_info(fmt::format(__VA_ARGS__)) +#else +#define KP_LOG_INFO(...) \ + fmt::print("[{} {}] [info] [{}:{}] {}\n", \ + __DATE__, \ + __TIME__, \ + __FILE__, \ + __LINE__, \ + fmt::format(__VA_ARGS__)) +#endif // KOMPUTE_BUILD_PYTHON +#endif // VK_USE_PLATFORM_ANDROID_KHR +#else +#define KP_LOG_INFO(...) +#endif +#endif // !KP_LOG_INFO + +#ifndef KP_LOG_WARN +#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_WARN +#if VK_USE_PLATFORM_ANDROID_KHR +#define KP_LOG_WARN(...) \ + ((void)__android_log_write( \ + ANDROID_LOG_WARN, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str())) +#else +#if KOMPUTE_BUILD_PYTHON +#define KP_LOG_DEBUG(...) kp_warning(fmt::format(__VA_ARGS__)) +#else +#define KP_LOG_WARN(...) \ + fmt::print("[{} {}] [warn] [{}:{}] {}\n", \ + __DATE__, \ + __TIME__, \ + __FILE__, \ + __LINE__, \ + fmt::format(__VA_ARGS__)) +#endif // KOMPUTE_BUILD_PYTHON +#endif // VK_USE_PLATFORM_ANDROID_KHR +#else +#define KP_LOG_WARN(...) +#endif +#endif // !KP_LOG_WARN + +#ifndef KP_LOG_ERROR +#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_ERROR +#if VK_USE_PLATFORM_ANDROID_KHR +#define KP_LOG_ERROR(...) \ + ((void)__android_log_write( \ + ANDROID_LOG_ERROR, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str())) +#else +#if KOMPUTE_BUILD_PYTHON +#define KP_LOG_DEBUG(...) kp_error(fmt::format(__VA_ARGS__)) +#else +#define KP_LOG_ERROR(...) \ + fmt::print("[{} {}] [error] [{}:{}] {}\n", \ + __DATE__, \ + __TIME__, \ + __FILE__, \ + __LINE__, \ + fmt::format(__VA_ARGS__)) +#endif // KOMPUTE_BUILD_PYTHON +#endif // VK_USE_PLATFORM_ANDROID_KHR +#else +#define KP_LOG_ERROR(...) +#endif +#endif // !KP_LOG_ERROR +#else + +#define KP_LOG_TRACE(...) SPDLOG_TRACE(__VA_ARGS__) +#define KP_LOG_DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__) +#define KP_LOG_INFO(...) SPDLOG_INFO(__VA_ARGS__) +#define KP_LOG_WARN(...) SPDLOG_WARN(__VA_ARGS__) +#define KP_LOG_ERROR(...) 
SPDLOG_ERROR(__VA_ARGS__) + +void +setLogLevel(spdlog::level::level_enum level); + +spdlog::level::level_enum +getLogLevel(); + +#endif // !KOMPUTE_OPT_USE_SPDLOG +} // namespace logger + +#endif // KOMPUTE_OPT_LOG_LEVEL_DISABLED diff --git a/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp b/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp new file mode 100644 index 000000000..e91598f05 --- /dev/null +++ b/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Algorithm.hpp" +#include "kompute/Core.hpp" +#include "kompute/Tensor.hpp" +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that provides a general abstraction that simplifies the use of + * algorithm and parameter components which can be used with shaders. + * By default it enables the user to provide a dynamic number of tensors + * which are then passed as inputs. + */ +class OpAlgoDispatch : public OpBase +{ + public: + /** + * Constructor that stores the algorithm to use as well as the relevant + * push constants to override when recording. + * + * @param algorithm The algorithm object to use for dispatch + * @param pushConstants The push constants to use for override + */ + template + OpAlgoDispatch(const std::shared_ptr& algorithm, + const std::vector& pushConstants = {}) + { + KP_LOG_DEBUG("Kompute OpAlgoDispatch constructor"); + + this->mAlgorithm = algorithm; + + if (pushConstants.size()) { + uint32_t memorySize = sizeof(decltype(pushConstants.back())); + uint32_t size = pushConstants.size(); + uint32_t totalSize = size * memorySize; + this->mPushConstantsData = malloc(totalSize); + memcpy(this->mPushConstantsData, pushConstants.data(), totalSize); + this->mPushConstantsDataTypeMemorySize = memorySize; + this->mPushConstantsSize = size; + } + } + + /** + * Default destructor, which is in charge of destroying the algorithm + * components but does not destroy the underlying tensors + */ + virtual ~OpAlgoDispatch() override; + + /** + * This records the commands that are to be sent to the GPU. This includes + * the barriers that ensure the memory has been copied before going in and + * out of the shader, as well as the dispatch operation that sends the + * shader processing to the gpu. This function also records the GPU memory + * copy of the output data for the staging buffer so it can be read by the + * host. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. 
+ */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + std::shared_ptr mAlgorithm; + void* mPushConstantsData = nullptr; + uint32_t mPushConstantsDataTypeMemorySize = 0; + uint32_t mPushConstantsSize = 0; +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpBase.hpp b/kompute/src/include/kompute/operations/OpBase.hpp new file mode 100644 index 000000000..737670846 --- /dev/null +++ b/kompute/src/include/kompute/operations/OpBase.hpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Algorithm.hpp" +#include "kompute/Core.hpp" +#include "kompute/Tensor.hpp" + +namespace kp { + +/** + * Base Operation which provides the high level interface that Kompute + * operations implement in order to perform a set of actions in the GPU. + * + * Operations can perform actions on tensors, and optionally can also own an + * Algorithm with respective parameters. kp::Operations with kp::Algorithms + * would inherit from kp::OpBaseAlgo. + */ +class OpBase +{ + public: + /** + * Default destructor for OpBase class. This OpBase destructor class should + * always be called to destroy and free owned resources unless it is + * intended to destroy the resources in the parent class. + */ + virtual ~OpBase() { KP_LOG_DEBUG("Kompute OpBase destructor started"); } + + /** + * The record function is intended to only send a record command or run + * commands that are expected to record operations that are to be submitted + * as a batch into the GPU. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void record(const vk::CommandBuffer& commandBuffer) = 0; + + /** + * Pre eval is called before the Sequence has called eval and submitted the + * commands to the GPU for processing, and can be used to perform any + * per-eval setup steps required as the computation iteration begins. It's + * worth noting that there are situations where eval can be called multiple + * times, so the resources that are created should be idempotent in case + * it's called multiple times in a row. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0; + + /** + * Post eval is called after the Sequence has called eval and submitted the + * commands to the GPU for processing, and can be used to perform any + * tear-down steps required as the computation iteration finishes. It's + * worth noting that there are situations where eval can be called multiple + * times, so the resources that are destroyed should not require a re-init + * unless explicitly provided by the user. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0; +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp b/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp new file mode 100644 index 000000000..50d8e9707 --- /dev/null +++ b/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +class OpBufferSyncDevice : public OpBase +{ + public: + OpBufferSyncDevice( + vk::Buffer *primaryBuffer, + vk::Buffer *stagingBuffer, + vk::DeviceSize size); + + /** + * Default destructor. 
This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpBufferSyncDevice() override; + + /** + * For device buffers, it records the copy command for the buffer to copy + * the data from its staging to device memory. + * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + vk::Buffer *mPrimaryBuffer; + vk::Buffer *mStagingBuffer; + vk::DeviceSize mSize; +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp b/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp new file mode 100644 index 000000000..7db997199 --- /dev/null +++ b/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +class OpBufferSyncLocal : public OpBase +{ + public: + OpBufferSyncLocal( + vk::Buffer *primaryBuffer, + vk::Buffer *stagingBuffer, + vk::DeviceSize size); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpBufferSyncLocal() override; + + /** + * For device buffers, it records the copy command for the buffer to copy + * the data from its staging to device memory. + * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + vk::Buffer *mPrimaryBuffer; + vk::Buffer *mStagingBuffer; + vk::DeviceSize mSize; +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp b/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp new file mode 100644 index 000000000..4a2322323 --- /dev/null +++ b/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Algorithm.hpp" +#include "kompute/Core.hpp" +#include "kompute/Tensor.hpp" +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that provides a general abstraction that simplifies the use of + * algorithm and parameter components which can be used with shaders. + * It exposes the pipeline barrier functionality specifically for memory + * barriers that can be configured through the respective source and destination + * masks + */ +class OpMemoryBarrier : public OpBase +{ + public: + /** + * Constructor that stores tensors as well as memory barrier parameters to + * be used to create a pipeline barrier on the respective primary or staging + * tensor. 
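+     *
+     * Illustrative sketch, assuming `sq` is a kp::Sequence and `tensors` are
+     * the tensors the barrier should cover:
+     * @code
+     * sq->record<kp::OpMemoryBarrier>(tensors,
+     *                                 vk::AccessFlagBits::eShaderWrite,
+     *                                 vk::AccessFlagBits::eShaderRead,
+     *                                 vk::PipelineStageFlagBits::eComputeShader,
+     *                                 vk::PipelineStageFlagBits::eComputeShader);
+     * @endcode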
+ * + * @param tensors The tensors to apply the memory barriers on + * @param srcAccessMask The kp::AccessFlagBits for the source access mask + * @param dstAccessMask The kp::AccessFlagBits for the destination access + * mask + * @param srcStageMask The kp::PipelineStageFlagBits for the source stage + * mask + * @param dstStageMask The kp::PipelineStageFlagBits for the destination + * stage mask + * @param barrierOnPrimary Boolean to select primary or secondary buffers on + * tensors + */ + OpMemoryBarrier(const std::vector>& tensors, + const vk::AccessFlagBits& srcAccessMask, + const vk::AccessFlagBits& dstAccessMask, + const vk::PipelineStageFlagBits& srcStageMask, + const vk::PipelineStageFlagBits& dstStageMask, + bool barrierOnPrimary = true); + + /** + * Default destructor, which is in charge of destroying the reference to the + * tensors and all the relevant access / stage masks created + */ + virtual ~OpMemoryBarrier() override; + + /** + * This records the memory barrier with the access and stage masks provided + * across all relevant tensors. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + const vk::AccessFlagBits mSrcAccessMask; + const vk::AccessFlagBits mDstAccessMask; + const vk::PipelineStageFlagBits mSrcStageMask; + const vk::PipelineStageFlagBits mDstStageMask; + const bool mBarrierOnPrimary; + const std::vector> mTensors; +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpMult.hpp b/kompute/src/include/kompute/operations/OpMult.hpp new file mode 100644 index 000000000..f75ccc4fb --- /dev/null +++ b/kompute/src/include/kompute/operations/OpMult.hpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include + +#include "kompute/Core.hpp" + +#include "ShaderOpMult.hpp" + +#include "kompute/Algorithm.hpp" +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpAlgoDispatch.hpp" + +namespace kp { + +/** + * Operation that performs multiplication on two tensors and outpus on third + * tensor. + */ +class OpMult : public OpAlgoDispatch +{ + public: + /** + * Default constructor with parameters that provides the bare minimum + * requirements for the operations to be able to create and manage their + * sub-components. 
+ * + * @param tensors Tensors that are to be used in this operation + * @param algorithm An algorithm that will be overridden with the OpMult + * shader data and the tensors provided which are expected to be 3 + */ + OpMult(std::vector> tensors, + std::shared_ptr algorithm) + : OpAlgoDispatch(algorithm) + { + KP_LOG_DEBUG("Kompute OpMult constructor with params"); + + if (tensors.size() != 3) { + throw std::runtime_error( + "Kompute OpMult expected 3 tensors but got " + + std::to_string(tensors.size())); + } + + const std::vector spirv = std::vector( + SHADEROPMULT_COMP_SPV.begin(), SHADEROPMULT_COMP_SPV.end()); + + algorithm->rebuild<>(tensors, spirv); + } + + /** + * Default destructor, which is in charge of destroying the algorithm + * components but does not destroy the underlying tensors + */ + ~OpMult() override { KP_LOG_DEBUG("Kompute OpMult destructor started"); } +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpTensorCopy.hpp b/kompute/src/include/kompute/operations/OpTensorCopy.hpp new file mode 100644 index 000000000..968c1065a --- /dev/null +++ b/kompute/src/include/kompute/operations/OpTensorCopy.hpp @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Core.hpp" + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that copies the data from the first tensor to the rest of the + * tensors provided, using a record command for all the vectors. This operation + * does not own/manage the memory of the tensors passed to it. The operation + * must only receive tensors of type + */ +class OpTensorCopy : public OpBase +{ + public: + /** + * Default constructor with parameters that provides the core vulkan + * resources and the tensors that will be used in the operation. + * + * @param tensors Tensors that will be used to create in operation. + */ + OpTensorCopy(const std::vector>& tensors); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpTensorCopy() override; + + /** + * Records the copy commands from the first tensor into all the other + * tensors provided. Also optionally records a barrier. + * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Copies the local vectors for all the tensors to sync the data with the + * gpu. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + std::vector> mTensors; +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp b/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp new file mode 100644 index 000000000..9b39e490f --- /dev/null +++ b/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Core.hpp" +#include "kompute/Tensor.hpp" +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that syncs tensor's device by mapping local data into the device + * memory. 
For TensorTypes::eDevice it will use a record operation for the + * memory to be syncd into GPU memory which means that the operation will be + * done in sync with GPU commands. For TensorTypes::eHost it will only map the + * data into host memory which will happen during preEval before the recorded + * commands are dispatched. + */ +class OpTensorSyncDevice : public OpBase +{ + public: + /** + * Default constructor with parameters that provides the core vulkan + * resources and the tensors that will be used in the operation. The tensos + * provided cannot be of type TensorTypes::eStorage. + * + * @param tensors Tensors that will be used to create in operation. + */ + OpTensorSyncDevice(const std::vector>& tensors); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpTensorSyncDevice() override; + + /** + * For device tensors, it records the copy command for the tensor to copy + * the data from its staging to device memory. + * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + std::vector> mTensors; + vk::Buffer *mPrimaryBuffer; + vk::Buffer *mStagingBuffer; + vk::DeviceSize mSize; +}; + +} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp b/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp new file mode 100644 index 000000000..4216003e5 --- /dev/null +++ b/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Core.hpp" + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that syncs tensor's local memory by mapping device data into the + * local CPU memory. For TensorTypes::eDevice it will use a record operation + * for the memory to be syncd into GPU memory which means that the operation + * will be done in sync with GPU commands. For TensorTypes::eHost it will + * only map the data into host memory which will happen during preEval before + * the recorded commands are dispatched. + */ +class OpTensorSyncLocal : public OpBase +{ + public: + /** + * Default constructor with parameters that provides the core vulkan + * resources and the tensors that will be used in the operation. The tensors + * provided cannot be of type TensorTypes::eStorage. + * + * @param tensors Tensors that will be used to create in operation. + */ + OpTensorSyncLocal(const std::vector>& tensors); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpTensorSyncLocal() override; + + /** + * For device tensors, it records the copy command for the tensor to copy + * the data from its device to staging memory. + * + * @param commandBuffer The command buffer to record the command into. 
+ */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * For host tensors it performs the map command from the host memory into + * local memory. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + std::vector> mTensors; +}; + +} // End namespace kp diff --git a/kompute/src/logger/CMakeLists.txt b/kompute/src/logger/CMakeLists.txt new file mode 100644 index 000000000..1dcc1e6b5 --- /dev/null +++ b/kompute/src/logger/CMakeLists.txt @@ -0,0 +1,69 @@ +cmake_minimum_required(VERSION 3.20) + +set(LOGGER_SOURCES Logger.cpp) + +add_library(kp_logger ${LOGGER_SOURCES}) + +# Define log levels in code +add_compile_definitions(KOMPUTE_LOG_LEVEL_TRACE=0) +add_compile_definitions(KOMPUTE_LOG_LEVEL_DEBUG=1) +add_compile_definitions(KOMPUTE_LOG_LEVEL_INFO=2) +add_compile_definitions(KOMPUTE_LOG_LEVEL_WARN=3) +add_compile_definitions(KOMPUTE_LOG_LEVEL_ERROR=4) +add_compile_definitions(KOMPUTE_LOG_LEVEL_CRITICAL=5) +add_compile_definitions(KOMPUTE_LOG_LEVEL_OFF=6) + +if(KOMPUTE_OPT_BUILD_PYTHON AND KOMPUTE_OPT_USE_SPDLOG) + message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_PYTHON' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.") +endif() + +if(KOMPUTE_OPT_ANDROID_BUILD AND KOMPUTE_OPT_USE_SPDLOG) + message(FATAL_ERROR "'KOMPUTE_OPT_ANDROID_BUILD' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.") +endif() + +if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Trace") + set(KOMPUTE_OPT_LOG_LEVEL TRACE) + message(STATUS "Using log level Trace") +elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Debug") + set(KOMPUTE_OPT_LOG_LEVEL DEBUG) + message(STATUS "Using log level Debug") +elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Info") + set(KOMPUTE_OPT_LOG_LEVEL INFO) + message(STATUS "Using log level Info") +elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Warn") + set(KOMPUTE_OPT_LOG_LEVEL WARN) + message(STATUS "Using log level Warn") +elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Error") + set(KOMPUTE_OPT_LOG_LEVEL ERROR) + message(STATUS "Using log level Error") +elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Critical") + set(KOMPUTE_OPT_LOG_LEVEL CRITICAL) + message(STATUS "Using log level Critical") +elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Off") + set(KOMPUTE_OPT_LOG_LEVEL OFF) + message(STATUS "Using log level Off") +elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Default") + set(KOMPUTE_OPT_LOG_LEVEL $,DEBUG,INFO>) + message(STATUS "Setting KOMPUTE_OPT_LOG_LEVEL to according to the build type") +else() + message(FATAL_ERROR "Log level '${KOMPUTE_OPT_LOG_LEVEL}' unknown, use -DKOMPUTE_OPT_LOG_LEVEL={Trace, Debug, Info, Warn, Error, Critical, Off, Default} to set it to a correct value.") +endif() + +# Always make sure we define the Kompute log level independent of the Spdlog log level +target_compile_definitions(kp_logger INTERFACE KOMPUTE_OPT_ACTIVE_LOG_LEVEL=KOMPUTE_LOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}) + +# Link depending on how the logger should be setup +if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED) + if(KOMPUTE_OPT_USE_SPDLOG) + target_link_libraries(kp_logger PUBLIC spdlog::spdlog) + target_compile_definitions(spdlog INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}) + 
target_compile_definitions(kp_logger INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}) + message(STATUS "setting SPDLOG_ACTIVE_LEVEL to SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}") + + if(KOMPUTE_OPT_SPDLOG_ASYNC_MODE) + target_compile_definitions(kp_logger INTERFACE KOMPUTE_SPDLOG_ASYNC_LOGGING=1) + endif() + else() + target_link_libraries(kp_logger PUBLIC fmt::fmt) + endif() +endif() diff --git a/kompute/src/logger/Logger.cpp b/kompute/src/logger/Logger.cpp new file mode 100644 index 000000000..69df2b609 --- /dev/null +++ b/kompute/src/logger/Logger.cpp @@ -0,0 +1,101 @@ +#include "kompute/logger/Logger.hpp" + +#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED +#if !KOMPUTE_OPT_USE_SPDLOG +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif // !KOMPUTE_OPT_USE_SPDLOG + +namespace logger { +#if !KOMPUTE_OPT_USE_SPDLOG + +void +setupLogger() +{ +} + +#else +constexpr int THREAD_QUEUE_LENGTH = 8192; + +void +setupLogger() +{ + // Ensure we setup the logger only once + static bool setup = false; + static std::mutex setupMutex{}; + setupMutex.lock(); + if (setup) { + setupMutex.unlock(); + return; + } + setup = true; + setupMutex.unlock(); + + spdlog::init_thread_pool(THREAD_QUEUE_LENGTH, 1); + spdlog::sink_ptr console_sink = + std::make_shared(); +#if SPDLOG_ACTIVE_LEVEL < SPDLOG_LEVEL_INFO + console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=21s] %v"); +#else + console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=15s] %v"); +#endif + std::vector sinks{ console_sink }; + // TODO: Add flag in compile flags + std::shared_ptr logger = +#if KOMPUTE_SPDLOG_ASYNC_LOGGING + std::make_shared( + "", + sinks.begin(), + sinks.end(), + spdlog::thread_pool(), + spdlog::async_overflow_policy::block); +#else + std::make_shared( + "", + sinks.begin(), + sinks.end()); +#endif + + logger->set_level(getLogLevel()); + + spdlog::set_default_logger(logger); +} + +spdlog::level::level_enum +getLogLevel() +{ +#if SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_TRACE + return spdlog::level::trace; +#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_DEBUG + return spdlog::level::debug; +#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_INFO + return spdlog::level::info; +#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_WARN + return spdlog::level::warn; +#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_ERROR + return spdlog::level::error; +#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_CRITICAL + return spdlog::level::critical; +#else + return spdlog::level::off; +#endif +} + +void +setLogLevel(const spdlog::level::level_enum level) +{ + spdlog::default_logger()->set_level(level); +} +#endif // !KOMPUTE_OPT_USE_SPDLOG +} // namespace logger + +#endif diff --git a/kompute/src/shaders/CMakeLists.txt b/kompute/src/shaders/CMakeLists.txt new file mode 100644 index 000000000..901bf3e8a --- /dev/null +++ b/kompute/src/shaders/CMakeLists.txt @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: Apache-2.0 +# ###################### +cmake_minimum_required(VERSION 3.20) + +add_subdirectory(glsl) \ No newline at end of file diff --git a/kompute/src/shaders/glsl/CMakeLists.txt b/kompute/src/shaders/glsl/CMakeLists.txt new file mode 100644 index 000000000..3101a2b17 --- /dev/null +++ b/kompute/src/shaders/glsl/CMakeLists.txt @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: Apache-2.0 +# ###################### +cmake_minimum_required(VERSION 3.20) + +# Check if build shaders from source is enabled +if(KOMPUTE_OPT_BUILD_SHADERS) + vulkan_compile_shader(INFILE ShaderOpMult.comp + OUTFILE ShaderOpMult.hpp + NAMESPACE "kp") + + 
vulkan_compile_shader(INFILE ShaderLogisticRegression.comp + OUTFILE ShaderLogisticRegression.hpp + NAMESPACE "kp") +else() # Else we will use our precompiled versions + add_custom_command(OUTPUT $/ShaderOpMult.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderOpMult.hpp.in $/ShaderOpMult.hpp) + add_custom_command(OUTPUT $/ShaderLogisticRegression.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderLogisticRegression.hpp.in $/ShaderLogisticRegression.hpp) +endif() + +add_library(kp_shader INTERFACE "${CMAKE_CURRENT_BINARY_DIR}/ShaderOpMult.hpp" + "${CMAKE_CURRENT_BINARY_DIR}/ShaderLogisticRegression.hpp") + +target_include_directories(kp_shader INTERFACE $) + +# Make sure we install shaders: +install(FILES $/ShaderOpMult.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +install(FILES $/ShaderLogisticRegression.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/kompute/src/shaders/glsl/ShaderLogisticRegression.comp b/kompute/src/shaders/glsl/ShaderLogisticRegression.comp new file mode 100644 index 000000000..5a1c5d948 --- /dev/null +++ b/kompute/src/shaders/glsl/ShaderLogisticRegression.comp @@ -0,0 +1,52 @@ +#version 450 + +layout (constant_id = 0) const float m = 0; + +layout (local_size_x = 1) in; + +layout(set = 0, binding = 0) buffer bxi { float xi[]; }; +layout(set = 0, binding = 1) buffer bxj { float xj[]; }; +layout(set = 0, binding = 2) buffer by { float y[]; }; +layout(set = 0, binding = 3) buffer bwin { float win[]; }; +layout(set = 0, binding = 4) buffer bwouti { float wouti[]; }; +layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; }; +layout(set = 0, binding = 6) buffer bbin { float bin[]; }; +layout(set = 0, binding = 7) buffer bbout { float bout[]; }; +layout(set = 0, binding = 8) buffer blout { float lout[]; }; + +float sigmoid(float z) { + return 1.0 / (1.0 + exp(-z)); +} + +float inference(vec2 x, vec2 w, float b) { + // Compute the linear mapping function + float z = dot(w, x) + b; + // Calculate the y-hat with sigmoid + float yHat = sigmoid(z); + return yHat; +} + +float calculateLoss(float yHat, float y) { + return -(y * log(yHat) + (1.0 - y) * log(1.0 - yHat)); +} + +void main() { + uint idx = gl_GlobalInvocationID.x; + + vec2 wCurr = vec2(win[0], win[1]); + float bCurr = bin[0]; + + vec2 xCurr = vec2(xi[idx], xj[idx]); + float yCurr = y[idx]; + + float yHat = inference(xCurr, wCurr, bCurr); + + float dZ = yHat - yCurr; + vec2 dW = (1. / m) * xCurr * dZ; + float dB = (1. 
/ m) * dZ; + wouti[idx] = dW.x; + woutj[idx] = dW.y; + bout[idx] = dB; + + lout[idx] = calculateLoss(yHat, yCurr); +} diff --git a/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in b/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in new file mode 100644 index 000000000..bfe7792c6 --- /dev/null +++ b/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in @@ -0,0 +1,310 @@ +#pragma once +#include +#include + +namespace kp { +const std::array SHADERLOGISTICREGRESSION_COMP_SPV = { +0x07230203, 0x00010000, 0x0008000a, 0x000000ae, +0x00000000, 0x00020011, 0x00000001, 0x0006000b, +0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, +0x00000000, 0x0003000e, 0x00000000, 0x00000001, +0x0006000f, 0x00000005, 0x00000004, 0x6e69616d, +0x00000000, 0x00000041, 0x00060010, 0x00000004, +0x00000011, 0x00000001, 0x00000001, 0x00000001, +0x00030003, 0x00000002, 0x000001c2, 0x00040005, +0x00000004, 0x6e69616d, 0x00000000, 0x00050005, +0x0000000a, 0x6d676973, 0x2864696f, 0x003b3166, +0x00030005, 0x00000009, 0x0000007a, 0x00080005, +0x00000012, 0x65666e69, 0x636e6572, 0x66762865, +0x66763b32, 0x31663b32, 0x0000003b, 0x00030005, +0x0000000f, 0x00000078, 0x00030005, 0x00000010, +0x00000077, 0x00030005, 0x00000011, 0x00000062, +0x00080005, 0x00000017, 0x636c6163, 0x74616c75, +0x736f4c65, 0x31662873, 0x3b31663b, 0x00000000, +0x00040005, 0x00000015, 0x74614879, 0x00000000, +0x00030005, 0x00000016, 0x00000079, 0x00030005, +0x00000021, 0x0000007a, 0x00040005, 0x00000027, +0x74614879, 0x00000000, 0x00040005, 0x00000028, +0x61726170, 0x0000006d, 0x00030005, 0x0000003e, +0x00786469, 0x00080005, 0x00000041, 0x475f6c67, +0x61626f6c, 0x766e496c, 0x7461636f, 0x496e6f69, +0x00000044, 0x00040005, 0x00000046, 0x72754377, +0x00000072, 0x00040005, 0x00000048, 0x6e697762, +0x00000000, 0x00040006, 0x00000048, 0x00000000, +0x006e6977, 0x00030005, 0x0000004a, 0x00000000, +0x00040005, 0x00000054, 0x72754362, 0x00000072, +0x00040005, 0x00000056, 0x6e696262, 0x00000000, +0x00040006, 0x00000056, 0x00000000, 0x006e6962, +0x00030005, 0x00000058, 0x00000000, 0x00040005, +0x0000005b, 0x72754378, 0x00000072, 0x00030005, +0x0000005d, 0x00697862, 0x00040006, 0x0000005d, +0x00000000, 0x00006978, 0x00030005, 0x0000005f, +0x00000000, 0x00030005, 0x00000064, 0x006a7862, +0x00040006, 0x00000064, 0x00000000, 0x00006a78, +0x00030005, 0x00000066, 0x00000000, 0x00040005, +0x0000006b, 0x72754379, 0x00000072, 0x00030005, +0x0000006d, 0x00007962, 0x00040006, 0x0000006d, +0x00000000, 0x00000079, 0x00030005, 0x0000006f, +0x00000000, 0x00040005, 0x00000073, 0x74614879, +0x00000000, 0x00040005, 0x00000074, 0x61726170, +0x0000006d, 0x00040005, 0x00000076, 0x61726170, +0x0000006d, 0x00040005, 0x00000078, 0x61726170, +0x0000006d, 0x00030005, 0x0000007b, 0x00005a64, +0x00030005, 0x0000007f, 0x00005764, 0x00030005, +0x00000080, 0x0000006d, 0x00030005, 0x00000086, +0x00004264, 0x00040005, 0x0000008b, 0x756f7762, +0x00006974, 0x00050006, 0x0000008b, 0x00000000, +0x74756f77, 0x00000069, 0x00030005, 0x0000008d, +0x00000000, 0x00040005, 0x00000093, 0x756f7762, +0x00006a74, 0x00050006, 0x00000093, 0x00000000, +0x74756f77, 0x0000006a, 0x00030005, 0x00000095, +0x00000000, 0x00040005, 0x0000009c, 0x756f6262, +0x00000074, 0x00050006, 0x0000009c, 0x00000000, +0x74756f62, 0x00000000, 0x00030005, 0x0000009e, +0x00000000, 0x00040005, 0x000000a3, 0x756f6c62, +0x00000074, 0x00050006, 0x000000a3, 0x00000000, +0x74756f6c, 0x00000000, 0x00030005, 0x000000a5, +0x00000000, 0x00040005, 0x000000a7, 0x61726170, +0x0000006d, 0x00040005, 0x000000a9, 0x61726170, +0x0000006d, 
0x00040047, 0x00000041, 0x0000000b, +0x0000001c, 0x00040047, 0x00000047, 0x00000006, +0x00000004, 0x00050048, 0x00000048, 0x00000000, +0x00000023, 0x00000000, 0x00030047, 0x00000048, +0x00000003, 0x00040047, 0x0000004a, 0x00000022, +0x00000000, 0x00040047, 0x0000004a, 0x00000021, +0x00000003, 0x00040047, 0x00000055, 0x00000006, +0x00000004, 0x00050048, 0x00000056, 0x00000000, +0x00000023, 0x00000000, 0x00030047, 0x00000056, +0x00000003, 0x00040047, 0x00000058, 0x00000022, +0x00000000, 0x00040047, 0x00000058, 0x00000021, +0x00000006, 0x00040047, 0x0000005c, 0x00000006, +0x00000004, 0x00050048, 0x0000005d, 0x00000000, +0x00000023, 0x00000000, 0x00030047, 0x0000005d, +0x00000003, 0x00040047, 0x0000005f, 0x00000022, +0x00000000, 0x00040047, 0x0000005f, 0x00000021, +0x00000000, 0x00040047, 0x00000063, 0x00000006, +0x00000004, 0x00050048, 0x00000064, 0x00000000, +0x00000023, 0x00000000, 0x00030047, 0x00000064, +0x00000003, 0x00040047, 0x00000066, 0x00000022, +0x00000000, 0x00040047, 0x00000066, 0x00000021, +0x00000001, 0x00040047, 0x0000006c, 0x00000006, +0x00000004, 0x00050048, 0x0000006d, 0x00000000, +0x00000023, 0x00000000, 0x00030047, 0x0000006d, +0x00000003, 0x00040047, 0x0000006f, 0x00000022, +0x00000000, 0x00040047, 0x0000006f, 0x00000021, +0x00000002, 0x00040047, 0x00000080, 0x00000001, +0x00000000, 0x00040047, 0x0000008a, 0x00000006, +0x00000004, 0x00050048, 0x0000008b, 0x00000000, +0x00000023, 0x00000000, 0x00030047, 0x0000008b, +0x00000003, 0x00040047, 0x0000008d, 0x00000022, +0x00000000, 0x00040047, 0x0000008d, 0x00000021, +0x00000004, 0x00040047, 0x00000092, 0x00000006, +0x00000004, 0x00050048, 0x00000093, 0x00000000, +0x00000023, 0x00000000, 0x00030047, 0x00000093, +0x00000003, 0x00040047, 0x00000095, 0x00000022, +0x00000000, 0x00040047, 0x00000095, 0x00000021, +0x00000005, 0x00040047, 0x0000009b, 0x00000006, +0x00000004, 0x00050048, 0x0000009c, 0x00000000, +0x00000023, 0x00000000, 0x00030047, 0x0000009c, +0x00000003, 0x00040047, 0x0000009e, 0x00000022, +0x00000000, 0x00040047, 0x0000009e, 0x00000021, +0x00000007, 0x00040047, 0x000000a2, 0x00000006, +0x00000004, 0x00050048, 0x000000a3, 0x00000000, +0x00000023, 0x00000000, 0x00030047, 0x000000a3, +0x00000003, 0x00040047, 0x000000a5, 0x00000022, +0x00000000, 0x00040047, 0x000000a5, 0x00000021, +0x00000008, 0x00040047, 0x000000ad, 0x0000000b, +0x00000019, 0x00020013, 0x00000002, 0x00030021, +0x00000003, 0x00000002, 0x00030016, 0x00000006, +0x00000020, 0x00040020, 0x00000007, 0x00000007, +0x00000006, 0x00040021, 0x00000008, 0x00000006, +0x00000007, 0x00040017, 0x0000000c, 0x00000006, +0x00000002, 0x00040020, 0x0000000d, 0x00000007, +0x0000000c, 0x00060021, 0x0000000e, 0x00000006, +0x0000000d, 0x0000000d, 0x00000007, 0x00050021, +0x00000014, 0x00000006, 0x00000007, 0x00000007, +0x0004002b, 0x00000006, 0x00000019, 0x3f800000, +0x00040015, 0x0000003c, 0x00000020, 0x00000000, +0x00040020, 0x0000003d, 0x00000007, 0x0000003c, +0x00040017, 0x0000003f, 0x0000003c, 0x00000003, +0x00040020, 0x00000040, 0x00000001, 0x0000003f, +0x0004003b, 0x00000040, 0x00000041, 0x00000001, +0x0004002b, 0x0000003c, 0x00000042, 0x00000000, +0x00040020, 0x00000043, 0x00000001, 0x0000003c, +0x0003001d, 0x00000047, 0x00000006, 0x0003001e, +0x00000048, 0x00000047, 0x00040020, 0x00000049, +0x00000002, 0x00000048, 0x0004003b, 0x00000049, +0x0000004a, 0x00000002, 0x00040015, 0x0000004b, +0x00000020, 0x00000001, 0x0004002b, 0x0000004b, +0x0000004c, 0x00000000, 0x00040020, 0x0000004d, +0x00000002, 0x00000006, 0x0004002b, 0x0000004b, +0x00000050, 0x00000001, 0x0003001d, 
0x00000055, +0x00000006, 0x0003001e, 0x00000056, 0x00000055, +0x00040020, 0x00000057, 0x00000002, 0x00000056, +0x0004003b, 0x00000057, 0x00000058, 0x00000002, +0x0003001d, 0x0000005c, 0x00000006, 0x0003001e, +0x0000005d, 0x0000005c, 0x00040020, 0x0000005e, +0x00000002, 0x0000005d, 0x0004003b, 0x0000005e, +0x0000005f, 0x00000002, 0x0003001d, 0x00000063, +0x00000006, 0x0003001e, 0x00000064, 0x00000063, +0x00040020, 0x00000065, 0x00000002, 0x00000064, +0x0004003b, 0x00000065, 0x00000066, 0x00000002, +0x0003001d, 0x0000006c, 0x00000006, 0x0003001e, +0x0000006d, 0x0000006c, 0x00040020, 0x0000006e, +0x00000002, 0x0000006d, 0x0004003b, 0x0000006e, +0x0000006f, 0x00000002, 0x00040032, 0x00000006, +0x00000080, 0x00000000, 0x0003001d, 0x0000008a, +0x00000006, 0x0003001e, 0x0000008b, 0x0000008a, +0x00040020, 0x0000008c, 0x00000002, 0x0000008b, +0x0004003b, 0x0000008c, 0x0000008d, 0x00000002, +0x0003001d, 0x00000092, 0x00000006, 0x0003001e, +0x00000093, 0x00000092, 0x00040020, 0x00000094, +0x00000002, 0x00000093, 0x0004003b, 0x00000094, +0x00000095, 0x00000002, 0x0004002b, 0x0000003c, +0x00000097, 0x00000001, 0x0003001d, 0x0000009b, +0x00000006, 0x0003001e, 0x0000009c, 0x0000009b, +0x00040020, 0x0000009d, 0x00000002, 0x0000009c, +0x0004003b, 0x0000009d, 0x0000009e, 0x00000002, +0x0003001d, 0x000000a2, 0x00000006, 0x0003001e, +0x000000a3, 0x000000a2, 0x00040020, 0x000000a4, +0x00000002, 0x000000a3, 0x0004003b, 0x000000a4, +0x000000a5, 0x00000002, 0x0006002c, 0x0000003f, +0x000000ad, 0x00000097, 0x00000097, 0x00000097, +0x00050036, 0x00000002, 0x00000004, 0x00000000, +0x00000003, 0x000200f8, 0x00000005, 0x0004003b, +0x0000003d, 0x0000003e, 0x00000007, 0x0004003b, +0x0000000d, 0x00000046, 0x00000007, 0x0004003b, +0x00000007, 0x00000054, 0x00000007, 0x0004003b, +0x0000000d, 0x0000005b, 0x00000007, 0x0004003b, +0x00000007, 0x0000006b, 0x00000007, 0x0004003b, +0x00000007, 0x00000073, 0x00000007, 0x0004003b, +0x0000000d, 0x00000074, 0x00000007, 0x0004003b, +0x0000000d, 0x00000076, 0x00000007, 0x0004003b, +0x00000007, 0x00000078, 0x00000007, 0x0004003b, +0x00000007, 0x0000007b, 0x00000007, 0x0004003b, +0x0000000d, 0x0000007f, 0x00000007, 0x0004003b, +0x00000007, 0x00000086, 0x00000007, 0x0004003b, +0x00000007, 0x000000a7, 0x00000007, 0x0004003b, +0x00000007, 0x000000a9, 0x00000007, 0x00050041, +0x00000043, 0x00000044, 0x00000041, 0x00000042, +0x0004003d, 0x0000003c, 0x00000045, 0x00000044, +0x0003003e, 0x0000003e, 0x00000045, 0x00060041, +0x0000004d, 0x0000004e, 0x0000004a, 0x0000004c, +0x0000004c, 0x0004003d, 0x00000006, 0x0000004f, +0x0000004e, 0x00060041, 0x0000004d, 0x00000051, +0x0000004a, 0x0000004c, 0x00000050, 0x0004003d, +0x00000006, 0x00000052, 0x00000051, 0x00050050, +0x0000000c, 0x00000053, 0x0000004f, 0x00000052, +0x0003003e, 0x00000046, 0x00000053, 0x00060041, +0x0000004d, 0x00000059, 0x00000058, 0x0000004c, +0x0000004c, 0x0004003d, 0x00000006, 0x0000005a, +0x00000059, 0x0003003e, 0x00000054, 0x0000005a, +0x0004003d, 0x0000003c, 0x00000060, 0x0000003e, +0x00060041, 0x0000004d, 0x00000061, 0x0000005f, +0x0000004c, 0x00000060, 0x0004003d, 0x00000006, +0x00000062, 0x00000061, 0x0004003d, 0x0000003c, +0x00000067, 0x0000003e, 0x00060041, 0x0000004d, +0x00000068, 0x00000066, 0x0000004c, 0x00000067, +0x0004003d, 0x00000006, 0x00000069, 0x00000068, +0x00050050, 0x0000000c, 0x0000006a, 0x00000062, +0x00000069, 0x0003003e, 0x0000005b, 0x0000006a, +0x0004003d, 0x0000003c, 0x00000070, 0x0000003e, +0x00060041, 0x0000004d, 0x00000071, 0x0000006f, +0x0000004c, 0x00000070, 0x0004003d, 0x00000006, +0x00000072, 
0x00000071, 0x0003003e, 0x0000006b, +0x00000072, 0x0004003d, 0x0000000c, 0x00000075, +0x0000005b, 0x0003003e, 0x00000074, 0x00000075, +0x0004003d, 0x0000000c, 0x00000077, 0x00000046, +0x0003003e, 0x00000076, 0x00000077, 0x0004003d, +0x00000006, 0x00000079, 0x00000054, 0x0003003e, +0x00000078, 0x00000079, 0x00070039, 0x00000006, +0x0000007a, 0x00000012, 0x00000074, 0x00000076, +0x00000078, 0x0003003e, 0x00000073, 0x0000007a, +0x0004003d, 0x00000006, 0x0000007c, 0x00000073, +0x0004003d, 0x00000006, 0x0000007d, 0x0000006b, +0x00050083, 0x00000006, 0x0000007e, 0x0000007c, +0x0000007d, 0x0003003e, 0x0000007b, 0x0000007e, +0x00050088, 0x00000006, 0x00000081, 0x00000019, +0x00000080, 0x0004003d, 0x0000000c, 0x00000082, +0x0000005b, 0x0005008e, 0x0000000c, 0x00000083, +0x00000082, 0x00000081, 0x0004003d, 0x00000006, +0x00000084, 0x0000007b, 0x0005008e, 0x0000000c, +0x00000085, 0x00000083, 0x00000084, 0x0003003e, +0x0000007f, 0x00000085, 0x00050088, 0x00000006, +0x00000087, 0x00000019, 0x00000080, 0x0004003d, +0x00000006, 0x00000088, 0x0000007b, 0x00050085, +0x00000006, 0x00000089, 0x00000087, 0x00000088, +0x0003003e, 0x00000086, 0x00000089, 0x0004003d, +0x0000003c, 0x0000008e, 0x0000003e, 0x00050041, +0x00000007, 0x0000008f, 0x0000007f, 0x00000042, +0x0004003d, 0x00000006, 0x00000090, 0x0000008f, +0x00060041, 0x0000004d, 0x00000091, 0x0000008d, +0x0000004c, 0x0000008e, 0x0003003e, 0x00000091, +0x00000090, 0x0004003d, 0x0000003c, 0x00000096, +0x0000003e, 0x00050041, 0x00000007, 0x00000098, +0x0000007f, 0x00000097, 0x0004003d, 0x00000006, +0x00000099, 0x00000098, 0x00060041, 0x0000004d, +0x0000009a, 0x00000095, 0x0000004c, 0x00000096, +0x0003003e, 0x0000009a, 0x00000099, 0x0004003d, +0x0000003c, 0x0000009f, 0x0000003e, 0x0004003d, +0x00000006, 0x000000a0, 0x00000086, 0x00060041, +0x0000004d, 0x000000a1, 0x0000009e, 0x0000004c, +0x0000009f, 0x0003003e, 0x000000a1, 0x000000a0, +0x0004003d, 0x0000003c, 0x000000a6, 0x0000003e, +0x0004003d, 0x00000006, 0x000000a8, 0x00000073, +0x0003003e, 0x000000a7, 0x000000a8, 0x0004003d, +0x00000006, 0x000000aa, 0x0000006b, 0x0003003e, +0x000000a9, 0x000000aa, 0x00060039, 0x00000006, +0x000000ab, 0x00000017, 0x000000a7, 0x000000a9, +0x00060041, 0x0000004d, 0x000000ac, 0x000000a5, +0x0000004c, 0x000000a6, 0x0003003e, 0x000000ac, +0x000000ab, 0x000100fd, 0x00010038, 0x00050036, +0x00000006, 0x0000000a, 0x00000000, 0x00000008, +0x00030037, 0x00000007, 0x00000009, 0x000200f8, +0x0000000b, 0x0004003d, 0x00000006, 0x0000001a, +0x00000009, 0x0004007f, 0x00000006, 0x0000001b, +0x0000001a, 0x0006000c, 0x00000006, 0x0000001c, +0x00000001, 0x0000001b, 0x0000001b, 0x00050081, +0x00000006, 0x0000001d, 0x00000019, 0x0000001c, +0x00050088, 0x00000006, 0x0000001e, 0x00000019, +0x0000001d, 0x000200fe, 0x0000001e, 0x00010038, +0x00050036, 0x00000006, 0x00000012, 0x00000000, +0x0000000e, 0x00030037, 0x0000000d, 0x0000000f, +0x00030037, 0x0000000d, 0x00000010, 0x00030037, +0x00000007, 0x00000011, 0x000200f8, 0x00000013, +0x0004003b, 0x00000007, 0x00000021, 0x00000007, +0x0004003b, 0x00000007, 0x00000027, 0x00000007, +0x0004003b, 0x00000007, 0x00000028, 0x00000007, +0x0004003d, 0x0000000c, 0x00000022, 0x00000010, +0x0004003d, 0x0000000c, 0x00000023, 0x0000000f, +0x00050094, 0x00000006, 0x00000024, 0x00000022, +0x00000023, 0x0004003d, 0x00000006, 0x00000025, +0x00000011, 0x00050081, 0x00000006, 0x00000026, +0x00000024, 0x00000025, 0x0003003e, 0x00000021, +0x00000026, 0x0004003d, 0x00000006, 0x00000029, +0x00000021, 0x0003003e, 0x00000028, 0x00000029, +0x00050039, 0x00000006, 0x0000002a, 
0x0000000a, +0x00000028, 0x0003003e, 0x00000027, 0x0000002a, +0x0004003d, 0x00000006, 0x0000002b, 0x00000027, +0x000200fe, 0x0000002b, 0x00010038, 0x00050036, +0x00000006, 0x00000017, 0x00000000, 0x00000014, +0x00030037, 0x00000007, 0x00000015, 0x00030037, +0x00000007, 0x00000016, 0x000200f8, 0x00000018, +0x0004003d, 0x00000006, 0x0000002e, 0x00000016, +0x0004003d, 0x00000006, 0x0000002f, 0x00000015, +0x0006000c, 0x00000006, 0x00000030, 0x00000001, +0x0000001c, 0x0000002f, 0x00050085, 0x00000006, +0x00000031, 0x0000002e, 0x00000030, 0x0004003d, +0x00000006, 0x00000032, 0x00000016, 0x00050083, +0x00000006, 0x00000033, 0x00000019, 0x00000032, +0x0004003d, 0x00000006, 0x00000034, 0x00000015, +0x00050083, 0x00000006, 0x00000035, 0x00000019, +0x00000034, 0x0006000c, 0x00000006, 0x00000036, +0x00000001, 0x0000001c, 0x00000035, 0x00050085, +0x00000006, 0x00000037, 0x00000033, 0x00000036, +0x00050081, 0x00000006, 0x00000038, 0x00000031, +0x00000037, 0x0004007f, 0x00000006, 0x00000039, +0x00000038, 0x000200fe, 0x00000039, 0x00010038 }; +} // namespace kp + + diff --git a/kompute/src/shaders/glsl/ShaderOpMult.comp b/kompute/src/shaders/glsl/ShaderOpMult.comp new file mode 100644 index 000000000..d54865037 --- /dev/null +++ b/kompute/src/shaders/glsl/ShaderOpMult.comp @@ -0,0 +1,28 @@ +#version 450 + +layout(set = 0, binding = 0) buffer tensorLhs { + float valuesLhs[ ]; +}; + +layout(set = 0, binding = 1) buffer tensorRhs { + float valuesRhs[ ]; +}; + +layout(set = 0, binding = 2) buffer tensorOutput { + float valuesOutput[ ]; +}; + +layout (constant_id = 0) const uint LEN_LHS = 0; +layout (constant_id = 1) const uint LEN_RHS = 0; +layout (constant_id = 2) const uint LEN_OUT = 0; + +layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in; + +void main() +{ + uint index = gl_GlobalInvocationID.x; + + valuesOutput[index] = valuesLhs[index] * valuesRhs[index]; +} + + diff --git a/kompute/src/shaders/glsl/ShaderOpMult.hpp.in b/kompute/src/shaders/glsl/ShaderOpMult.hpp.in new file mode 100644 index 000000000..5af29c66d --- /dev/null +++ b/kompute/src/shaders/glsl/ShaderOpMult.hpp.in @@ -0,0 +1,101 @@ +#pragma once +#include +#include + +namespace kp { +const std::array SHADEROPMULT_COMP_SPV = { +0x07230203, 0x00010000, 0x0008000a, 0x0000002e, +0x00000000, 0x00020011, 0x00000001, 0x0006000b, +0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, +0x00000000, 0x0003000e, 0x00000000, 0x00000001, +0x0006000f, 0x00000005, 0x00000004, 0x6e69616d, +0x00000000, 0x0000000b, 0x00060010, 0x00000004, +0x00000011, 0x00000001, 0x00000001, 0x00000001, +0x00030003, 0x00000002, 0x000001c2, 0x00040005, +0x00000004, 0x6e69616d, 0x00000000, 0x00040005, +0x00000008, 0x65646e69, 0x00000078, 0x00080005, +0x0000000b, 0x475f6c67, 0x61626f6c, 0x766e496c, +0x7461636f, 0x496e6f69, 0x00000044, 0x00060005, +0x00000012, 0x736e6574, 0x754f726f, 0x74757074, +0x00000000, 0x00070006, 0x00000012, 0x00000000, +0x756c6176, 0x754f7365, 0x74757074, 0x00000000, +0x00030005, 0x00000014, 0x00000000, 0x00050005, +0x00000019, 0x736e6574, 0x684c726f, 0x00000073, +0x00060006, 0x00000019, 0x00000000, 0x756c6176, +0x684c7365, 0x00000073, 0x00030005, 0x0000001b, +0x00000000, 0x00050005, 0x00000021, 0x736e6574, +0x6852726f, 0x00000073, 0x00060006, 0x00000021, +0x00000000, 0x756c6176, 0x68527365, 0x00000073, +0x00030005, 0x00000023, 0x00000000, 0x00040005, +0x00000029, 0x5f4e454c, 0x0053484c, 0x00040005, +0x0000002a, 0x5f4e454c, 0x00534852, 0x00040005, +0x0000002b, 0x5f4e454c, 0x0054554f, 0x00040047, +0x0000000b, 0x0000000b, 0x0000001c, 0x00040047, 
+0x00000011, 0x00000006, 0x00000004, 0x00050048, +0x00000012, 0x00000000, 0x00000023, 0x00000000, +0x00030047, 0x00000012, 0x00000003, 0x00040047, +0x00000014, 0x00000022, 0x00000000, 0x00040047, +0x00000014, 0x00000021, 0x00000002, 0x00040047, +0x00000018, 0x00000006, 0x00000004, 0x00050048, +0x00000019, 0x00000000, 0x00000023, 0x00000000, +0x00030047, 0x00000019, 0x00000003, 0x00040047, +0x0000001b, 0x00000022, 0x00000000, 0x00040047, +0x0000001b, 0x00000021, 0x00000000, 0x00040047, +0x00000020, 0x00000006, 0x00000004, 0x00050048, +0x00000021, 0x00000000, 0x00000023, 0x00000000, +0x00030047, 0x00000021, 0x00000003, 0x00040047, +0x00000023, 0x00000022, 0x00000000, 0x00040047, +0x00000023, 0x00000021, 0x00000001, 0x00040047, +0x00000029, 0x00000001, 0x00000000, 0x00040047, +0x0000002a, 0x00000001, 0x00000001, 0x00040047, +0x0000002b, 0x00000001, 0x00000002, 0x00040047, +0x0000002d, 0x0000000b, 0x00000019, 0x00020013, +0x00000002, 0x00030021, 0x00000003, 0x00000002, +0x00040015, 0x00000006, 0x00000020, 0x00000000, +0x00040020, 0x00000007, 0x00000007, 0x00000006, +0x00040017, 0x00000009, 0x00000006, 0x00000003, +0x00040020, 0x0000000a, 0x00000001, 0x00000009, +0x0004003b, 0x0000000a, 0x0000000b, 0x00000001, +0x0004002b, 0x00000006, 0x0000000c, 0x00000000, +0x00040020, 0x0000000d, 0x00000001, 0x00000006, +0x00030016, 0x00000010, 0x00000020, 0x0003001d, +0x00000011, 0x00000010, 0x0003001e, 0x00000012, +0x00000011, 0x00040020, 0x00000013, 0x00000002, +0x00000012, 0x0004003b, 0x00000013, 0x00000014, +0x00000002, 0x00040015, 0x00000015, 0x00000020, +0x00000001, 0x0004002b, 0x00000015, 0x00000016, +0x00000000, 0x0003001d, 0x00000018, 0x00000010, +0x0003001e, 0x00000019, 0x00000018, 0x00040020, +0x0000001a, 0x00000002, 0x00000019, 0x0004003b, +0x0000001a, 0x0000001b, 0x00000002, 0x00040020, +0x0000001d, 0x00000002, 0x00000010, 0x0003001d, +0x00000020, 0x00000010, 0x0003001e, 0x00000021, +0x00000020, 0x00040020, 0x00000022, 0x00000002, +0x00000021, 0x0004003b, 0x00000022, 0x00000023, +0x00000002, 0x00040032, 0x00000006, 0x00000029, +0x00000000, 0x00040032, 0x00000006, 0x0000002a, +0x00000000, 0x00040032, 0x00000006, 0x0000002b, +0x00000000, 0x0004002b, 0x00000006, 0x0000002c, +0x00000001, 0x0006002c, 0x00000009, 0x0000002d, +0x0000002c, 0x0000002c, 0x0000002c, 0x00050036, +0x00000002, 0x00000004, 0x00000000, 0x00000003, +0x000200f8, 0x00000005, 0x0004003b, 0x00000007, +0x00000008, 0x00000007, 0x00050041, 0x0000000d, +0x0000000e, 0x0000000b, 0x0000000c, 0x0004003d, +0x00000006, 0x0000000f, 0x0000000e, 0x0003003e, +0x00000008, 0x0000000f, 0x0004003d, 0x00000006, +0x00000017, 0x00000008, 0x0004003d, 0x00000006, +0x0000001c, 0x00000008, 0x00060041, 0x0000001d, +0x0000001e, 0x0000001b, 0x00000016, 0x0000001c, +0x0004003d, 0x00000010, 0x0000001f, 0x0000001e, +0x0004003d, 0x00000006, 0x00000024, 0x00000008, +0x00060041, 0x0000001d, 0x00000025, 0x00000023, +0x00000016, 0x00000024, 0x0004003d, 0x00000010, +0x00000026, 0x00000025, 0x00050085, 0x00000010, +0x00000027, 0x0000001f, 0x00000026, 0x00060041, +0x0000001d, 0x00000028, 0x00000014, 0x00000016, +0x00000017, 0x0003003e, 0x00000028, 0x00000027, +0x000100fd, 0x00010038 }; +} // namespace kp + + diff --git a/kompute/src/shaders/hlsl/computeheadless.comp b/kompute/src/shaders/hlsl/computeheadless.comp new file mode 100644 index 000000000..ee3cd024f --- /dev/null +++ b/kompute/src/shaders/hlsl/computeheadless.comp @@ -0,0 +1,29 @@ +// Copyright 2020 Google LLC + +RWStructuredBuffer values : register(u0); +[[vk::constant_id(0)]] const uint BUFFER_ELEMENTS = 
32; + +uint fibonacci(uint n) { + if(n <= 1){ + return n; + } + uint curr = 1; + uint prev = 1; + for(uint i = 2; i < n; ++i) { + uint temp = curr; + curr += prev; + prev = temp; + } + return curr; +} + +[numthreads(1, 1, 1)] +void main(uint3 GlobalInvocationID : SV_DispatchThreadID) +{ + uint index = GlobalInvocationID.x; + if (index >= BUFFER_ELEMENTS) + return; + values[index] = fibonacci(values[index]); +} + + diff --git a/llama.cpp b/llama.cpp index 6e23a0772..c835c6fd4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9,6 +9,8 @@ # include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) # include "ggml-opencl.h" +#elif defined(GGML_USE_KOMPUTE) +# include "ggml-vulkan.h" #endif #ifdef GGML_USE_METAL @@ -1182,11 +1184,14 @@ struct llama_context { #ifdef GGML_USE_METAL ggml_metal_context * ctx_metal = NULL; +#elif defined(GGML_USE_KOMPUTE) + ggml_kompute_context * ctx_kompute = NULL; #endif #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; #endif + }; // @@ -2474,6 +2479,9 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * cur; struct ggml_tensor * inpL; +#if defined(GGML_USE_KOMPUTE) + struct ggml_tensor * toDeviceTensor = nullptr; +#endif if (tokens) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); @@ -2483,6 +2491,9 @@ static struct ggml_cgraph * llm_build_llama( memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); } ggml_set_name(inp_tokens, "inp_tokens"); +#if defined(GGML_USE_KOMPUTE) + toDeviceTensor = inp_tokens; +#endif inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -2491,6 +2502,9 @@ static struct ggml_cgraph * llm_build_llama( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); +#if defined(GGML_USE_KOMPUTE) + toDeviceTensor = inpL; +#endif ggml_allocr_alloc(lctx.alloc, inpL); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -2693,7 +2707,6 @@ static struct ggml_cgraph * llm_build_llama( offload_func(cur); ggml_set_name(cur, "ffn_norm"); } - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); @@ -2752,6 +2765,16 @@ static struct ggml_cgraph * llm_build_llama( ggml_free(ctx0); +#if defined(GGML_USE_KOMPUTE) + if (lctx.ctx_kompute && N == 1) { + if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) { + ggml_vk_h2d_all(lctx.ctx_kompute); + } else { + ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor); + } + } +#endif + return gf; } @@ -3792,6 +3815,17 @@ static bool llama_eval_internal( } else { ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); } +#elif defined(GGML_USE_KOMPUTE) + if (lctx.ctx_kompute && N == 1) { + ggml_vk_graph_compute(lctx.ctx_kompute, gf); + ggml_vk_d2h_tensor(lctx.ctx_kompute, res); + } else { + ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + if (lctx.ctx_kompute) { + ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k); + ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v); + } + } #else ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); #endif @@ -3833,12 +3867,12 @@ static bool llama_eval_internal( } // extract embeddings - if (!lctx.embedding.empty()) { - auto & embedding_out = lctx.embedding; + //if (!lctx.embedding.empty()) { + // auto & embedding_out = lctx.embedding; - embedding_out.resize(n_embd); - memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); - } + // embedding_out.resize(n_embd); + // memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + //} // measure the performance only for the single-token 
evals if (N == 1) { @@ -5904,6 +5938,7 @@ static int llama_apply_lora_from_file_internal( ) { LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); + const int64_t t_start_lora_us = ggml_time_us(); auto fin = std::ifstream(path_lora, std::ios::binary); diff --git a/llama.h b/llama.h index 350268b9a..3d911adca 100644 --- a/llama.h +++ b/llama.h @@ -42,7 +42,7 @@ #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_VERSION 1 -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) +#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. #define LLAMA_SUPPORTS_GPU_OFFLOAD #endif diff --git a/undump.py b/undump.py new file mode 100644 index 000000000..db19ffe69 --- /dev/null +++ b/undump.py @@ -0,0 +1,18 @@ +import struct +import numpy as np +from pathlib import Path + +def undump(fn): + with open(fn, 'rb') as df: + dims = struct.unpack('=QQQQ', df.read(8*4)) + (dsz,) = struct.unpack('=Q', df.read(8)) + ## assume f32 + data = df.read(dsz) + data = [i for (i,) in struct.iter_unpack('=f', data)] + return np.array(data).reshape(dims).squeeze() + +if __name__ == '__main__': + for dfn in sorted(Path('.').glob('*.dump')): + darr = undump(dfn) + print(f'{dfn}: {darr.shape}\n{darr}') + From 48a45ea435d091a8465d0b4daf5e9ebdcebf0802 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 30 Aug 2023 14:33:31 -0400 Subject: [PATCH 02/93] Remove warning which fails on windows. --- ggml-vulkan.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 32590d03e..9b5c01f68 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -49,10 +49,6 @@ #include #include -#ifndef __STDC_IEC_559__ -#warning Your C implementation does not seem to be IEC 559 compliant, which is required for proper Vulkan interop. 
-#endif - #define QK4_0 32 #define QR4_0 2 #define QK4_1 32 From 8563fa001f20f3d292778c39f4288bd6b06d2460 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Tue, 5 Sep 2023 13:42:27 -0700 Subject: [PATCH 03/93] remove dynamic deps from kompute build should no longer have new external deps other than libvulkan ``` ubuntu@ip-172-31-1-24:~/repo/gpt4all/gpt4all-backend/build$ ldd ./libllamamodel-mainline-avxonly.so linux-vdso.so.1 (0x00007ffcb53bb000) libvulkan.so.1 => /lib/x86_64-linux-gnu/libvulkan.so.1 (0x00007f239dab5000) libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f239d800000) libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f239d719000) libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f239da95000) libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f239d400000) /lib64/ld-linux-x86-64.so.2 (0x00007f239dd1d000) ``` --- kompute/src/CMakeLists.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt index f4f8440f4..5f02ce12c 100644 --- a/kompute/src/CMakeLists.txt +++ b/kompute/src/CMakeLists.txt @@ -8,7 +8,7 @@ endif() cmake_minimum_required(VERSION 3.20) -add_library(kompute Algorithm.cpp +add_library(kompute STATIC Algorithm.cpp Manager.cpp OpAlgoDispatch.cpp OpMemoryBarrier.cpp @@ -27,7 +27,8 @@ add_library(kompute::kompute ALIAS kompute) set_target_properties(kompute PROPERTIES VERSION ${${PROJECT_NAME}_VERSION} - SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR}) + SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR} + POSITION_INDEPENDENT_CODE TRUE) # Import GNU common install directory variables include(GNUInstallDirs) @@ -56,12 +57,12 @@ if(KOMPUTE_OPT_ANDROID_BUILD) android kp_logger kp_shader - fmt::fmt) + fmt::fmt-header-only) else() target_link_libraries(kompute PUBLIC Vulkan::Vulkan kp_logger kp_shader - fmt::fmt) + fmt::fmt-header-only) endif() if(KOMPUTE_OPT_BUILD_PYTHON) From 45c8778b49184c60946718dc67cdf935c0031585 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Tue, 12 Sep 2023 12:39:38 -0400 Subject: [PATCH 04/93] Switch to a dynamic dispatch table instead of linking hard against libvulkan. 
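
For context, the vulkan-hpp pattern this change moves to looks roughly like the sketch below (illustrative only, not code from this patch): compile with VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1, define storage for the default dispatcher exactly once, initialize it first from the loader's vkGetInstanceProcAddr, then again once an instance exists so instance-level entry points are resolved. Names and structure here are a minimal example of the upstream vulkan-hpp API, not part of the diff that follows.

```
// Minimal sketch of vulkan-hpp dynamic dispatch (not part of this patch).
// Assumes the target is compiled with -DVULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1.
#include <vulkan/vulkan.hpp>

// Define storage for the default dispatcher exactly once in the program.
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE

int main() {
    // Load libvulkan at runtime instead of linking against it at build time.
    vk::DynamicLoader dl;
    auto vkGetInstanceProcAddr =
        dl.getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");

    // First init: only loader-level entry points are available.
    VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);

    // Second init: instance-level entry points become available.
    vk::Instance instance = vk::createInstance(vk::InstanceCreateInfo{});
    VULKAN_HPP_DEFAULT_DISPATCHER.init(instance);

    instance.destroy();
    return 0;
}
```

With the dispatcher set up this way the backend no longer needs to link against libvulkan at build time; the loader is resolved at runtime, which is what the changes below implement.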
--- ggml-vulkan.cpp | 15 +++++++-------- ggml-vulkan.h | 1 + kompute/CMakeLists.txt | 2 ++ kompute/src/CMakeLists.txt | 2 +- kompute/src/Core.cpp | 2 -- kompute/src/Manager.cpp | 13 +++++++------ kompute/src/include/kompute/Manager.hpp | 5 +++++ 7 files changed, 23 insertions(+), 17 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 9b5c01f68..055b1124d 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -123,21 +123,20 @@ static std::string ggml_vk_getVendorName(uint32_t vendorID) { } std::vector ggml_vk_available_devices(size_t memoryRequired) { - std::vector physicalDevices = mgr.listDevices(); - uint32_t deviceCount = physicalDevices.size(); std::vector results; + if (!mgr.hasVulkan()) + return results; + + std::vector physicalDevices = mgr.listDevices(); + uint32_t deviceCount = physicalDevices.size(); if (deviceCount == 0) return results; for (uint32_t i = 0; i < deviceCount; i++) { - VkPhysicalDeviceProperties properties; - vkGetPhysicalDeviceProperties(physicalDevices.at(i), &properties); - - VkPhysicalDeviceMemoryProperties memoryProperties; - vkGetPhysicalDeviceMemoryProperties(physicalDevices.at(i), &memoryProperties); - + VkPhysicalDeviceProperties properties = physicalDevices.at(i).getProperties(); + VkPhysicalDeviceMemoryProperties memoryProperties = physicalDevices.at(i).getMemoryProperties(); const uint32_t major = VK_VERSION_MAJOR(properties.apiVersion); const uint32_t minor = VK_VERSION_MINOR(properties.apiVersion); if (major < 1 || minor < 2) diff --git a/ggml-vulkan.h b/ggml-vulkan.h index ad8b41e4d..d13ed4184 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -40,6 +40,7 @@ std::vector ggml_vk_available_devices(size_t memoryRequired); bool ggml_vk_init_device(size_t memoryRequired, const std::string &device); bool ggml_vk_init_device(const ggml_vk_device &device); bool ggml_vk_init_device(int device); +bool ggml_vk_has_vulkan(); bool ggml_vk_has_device(); ggml_vk_device ggml_vk_current_device(); struct ggml_kompute_context * ggml_vk_init(void); diff --git a/kompute/CMakeLists.txt b/kompute/CMakeLists.txt index f89e13d1d..aa228653a 100644 --- a/kompute/CMakeLists.txt +++ b/kompute/CMakeLists.txt @@ -158,6 +158,8 @@ else() find_package(fmt REQUIRED) endif() +add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) + # #################################################### # Preprocessor Macros # #################################################### diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt index 5f02ce12c..4179a81f2 100644 --- a/kompute/src/CMakeLists.txt +++ b/kompute/src/CMakeLists.txt @@ -59,7 +59,7 @@ if(KOMPUTE_OPT_ANDROID_BUILD) kp_shader fmt::fmt-header-only) else() - target_link_libraries(kompute PUBLIC Vulkan::Vulkan + target_link_libraries(kompute PUBLIC kp_logger kp_shader fmt::fmt-header-only) diff --git a/kompute/src/Core.cpp b/kompute/src/Core.cpp index 60849a3ec..9b0483232 100644 --- a/kompute/src/Core.cpp +++ b/kompute/src/Core.cpp @@ -10,7 +10,6 @@ #include "kompute/Core.hpp" -#if VK_USE_PLATFORM_ANDROID_KHR #ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE #define KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE /** @@ -21,7 +20,6 @@ **/ VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE #endif // !KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE -#endif // VK_USE_PLATFORM_ANDROID_KHR namespace kp { } // namespace kp diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp index 07514ed9a..2c86b6e10 100644 --- a/kompute/src/Manager.cpp +++ b/kompute/src/Manager.cpp @@ -223,20 +223,21 @@ 
Manager::createInstance() } #endif -#if VK_USE_PLATFORM_ANDROID_KHR - vk::DynamicLoader dl; + try { + mDynamicLoader = std::make_shared(); + } catch (const std::exception & err) { + return; + } + PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr = - dl.getProcAddress("vkGetInstanceProcAddr"); + mDynamicLoader->getProcAddress("vkGetInstanceProcAddr"); VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); -#endif // VK_USE_PLATFORM_ANDROID_KHR this->mInstance = std::make_shared(); vk::createInstance( &computeInstanceCreateInfo, nullptr, this->mInstance.get()); -#if VK_USE_PLATFORM_ANDROID_KHR VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance); -#endif // VK_USE_PLATFORM_ANDROID_KHR KP_LOG_DEBUG("Kompute Manager Instance Created"); diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp index 8fda58f84..42336f4e8 100644 --- a/kompute/src/include/kompute/Manager.hpp +++ b/kompute/src/include/kompute/Manager.hpp @@ -43,6 +43,10 @@ class Manager return this->mDevice.get(); } + bool hasVulkan() const { + return this->mDynamicLoader.get(); + } + /** * Initialize a device. * @@ -240,6 +244,7 @@ class Manager bool mFreeInstance = false; std::shared_ptr mPhysicalDevice = nullptr; std::shared_ptr mDevice = nullptr; + std::shared_ptr mDynamicLoader = nullptr; bool mFreeDevice = false; // -------------- ALWAYS OWNED RESOURCES From b7e2e691d40ca0a6e8e1e1a9186e16eafde599ae Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Tue, 12 Sep 2023 13:04:55 -0400 Subject: [PATCH 05/93] Completely revamp how we do object management with the vulkan backend and stop using so many static objects so we can tear down and bring up vulkan on new devices in the same runtime. --- ggml-vulkan.cpp | 185 +++++++++++++--------- ggml-vulkan.h | 1 + kompute/src/Algorithm.cpp | 26 +-- kompute/src/Manager.cpp | 41 +++-- kompute/src/include/kompute/Algorithm.hpp | 5 +- kompute/src/include/kompute/Manager.hpp | 27 +++- 6 files changed, 172 insertions(+), 113 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 055b1124d..89de70fa4 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -65,9 +65,21 @@ struct ggml_kompute_context { } }; +// FIXME: It would be good to consolidate the kompute manager and the kompute context into one object +// and consolidate the init functions and simplify object lifetime management. As it currently stands, +// we *have* to have the kompute manager no matter what for device discovery, but the kompute context +// is only created when a device is set and vulkan is explicitly turned on. 
ggml_kompute_context *ggml_kompute_context::instance; - -kp::Manager mgr; +kp::Manager *komputeManager() { + static kp::Manager *s_mgr = nullptr; + if (s_mgr && !s_mgr->hasInstance()) { + delete s_mgr; + s_mgr = nullptr; + } + if (!s_mgr) + s_mgr = new kp::Manager; + return s_mgr; +} #ifdef __linux__ __attribute__((constructor)) @@ -123,12 +135,11 @@ static std::string ggml_vk_getVendorName(uint32_t vendorID) { } std::vector ggml_vk_available_devices(size_t memoryRequired) { - std::vector results; - if (!mgr.hasVulkan()) + if (!komputeManager()->hasVulkan()) return results; - std::vector physicalDevices = mgr.listDevices(); + std::vector physicalDevices = komputeManager()->listDevices(); uint32_t deviceCount = physicalDevices.size(); if (deviceCount == 0) @@ -228,22 +239,33 @@ bool ggml_vk_init_device(const ggml_vk_device &device) { } bool ggml_vk_init_device(int device) { - mgr.initializeDevice(device, {}, + komputeManager()->initializeDevice(device, {}, {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage", "VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"}); return ggml_vk_has_device(); } +bool ggml_vk_free_device() { + if (!ggml_vk_has_device()) + return false; + komputeManager()->destroy(); + return true; +} + +bool ggml_vk_has_vulkan() { + return komputeManager()->hasVulkan(); +} + bool ggml_vk_has_device() { - return mgr.hasDevice(); + return komputeManager()->hasDevice(); } ggml_vk_device ggml_vk_current_device() { - if (!mgr.hasDevice()) + if (!komputeManager()->hasDevice()) return ggml_vk_device(); std::vector devices = ggml_vk_available_devices(0); - ggml_vk_filterByName(devices, mgr.physicalDevice()->getProperties().deviceName); + ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName); return devices.front(); } @@ -275,7 +297,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t descriptorPoolSizes.data()); ctx->pool = std::make_shared(); - vk::Result r = mgr.device()->createDescriptorPool( + vk::Result r = komputeManager()->device()->createDescriptorPool( &descriptorPoolInfo, nullptr, ctx->pool.get()); if (r != vk::Result::eSuccess) std::cerr << "Error allocating descriptor pool" << vk::to_string(r); @@ -284,7 +306,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t static void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) { if (ctx->pool) { - mgr.device()->destroy( + komputeManager()->device()->destroy( *ctx->pool, (vk::Optional)nullptr); ctx->pool = nullptr; @@ -301,7 +323,7 @@ vk::Buffer *ggml_vk_allocate_buffer(size_t size) { bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive; vk::Buffer *vkBuffer = new vk::Buffer; - vk::Result r = mgr.device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer); + vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer); if (r != vk::Result::eSuccess) std::cerr << "Error allocating buffer" << vk::to_string(r); return vkBuffer; @@ -312,7 +334,7 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v uint32_t memoryTypeIndex = -1; bool memoryTypeIndexFound = false; - vk::PhysicalDeviceMemoryProperties memoryProperties = mgr.physicalDevice()->getMemoryProperties(); + vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties(); for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) { if (requirements.memoryTypeBits & (1 << i)) { if (((memoryProperties.memoryTypes[i]).propertyFlags 
& @@ -335,7 +357,7 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v allocInfo.allocationSize = size; allocInfo.memoryTypeIndex = memoryTypeIndex; vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory; - vk::Result r = mgr.device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory); + vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory); if (r != vk::Result::eSuccess) std::cerr << "Error allocating memory" << vk::to_string(r); return vkDeviceMemory; @@ -346,7 +368,7 @@ size_t ggml_vk_aligned_offset(size_t offset) { static size_t minStorageBufferOffsetAlignment = 0; if (minStorageBufferOffsetAlignment == 0) { vk::PhysicalDeviceProperties deviceProperties; - deviceProperties = mgr.physicalDevice()->getProperties(); + deviceProperties = komputeManager()->physicalDevice()->getProperties(); vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits; minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment; } @@ -362,12 +384,12 @@ size_t ggml_vk_aligned_offset(size_t offset) { static void ggml_vk_h2d_buffer(const ggml_vk_memory &memory) { if (memory.stagingBuffer) - mgr.sequence()->eval(memory.primaryBuffer, memory.stagingBuffer, memory.size); + komputeManager()->sequence()->eval(memory.primaryBuffer, memory.stagingBuffer, memory.size); } static void ggml_vk_d2h_buffer(const ggml_vk_memory &memory) { if (memory.stagingBuffer) - mgr.sequence()->eval(memory.primaryBuffer, memory.stagingBuffer, memory.size); + komputeManager()->sequence()->eval(memory.primaryBuffer, memory.stagingBuffer, memory.size); } ggml_vk_memory ggml_vk_allocate(size_t size) { @@ -375,12 +397,12 @@ ggml_vk_memory ggml_vk_allocate(size_t size) { bool isHostVisible = false; { memory.primaryBuffer = ggml_vk_allocate_buffer(size); - vk::MemoryRequirements memoryRequirements = mgr.device()->getBufferMemoryRequirements(*memory.primaryBuffer); + vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer); vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal; memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible); - mgr.device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0); + komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0); if (isHostVisible) { - vk::Result r = mgr.device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data); + vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data); if (r != vk::Result::eSuccess) std::cerr << "Error mapping memory" << vk::to_string(r); } @@ -388,13 +410,13 @@ ggml_vk_memory ggml_vk_allocate(size_t size) { if (!isHostVisible) { memory.stagingBuffer = ggml_vk_allocate_buffer(size); - vk::MemoryRequirements memoryRequirements = mgr.device()->getBufferMemoryRequirements(*memory.stagingBuffer); + vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer); vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached; memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible); - mgr.device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0); - vk::Result r = 
mgr.device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data); + komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0); + vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data); if (r != vk::Result::eSuccess) std::cerr << "Error mapping memory" << vk::to_string(r); } @@ -405,19 +427,19 @@ ggml_vk_memory ggml_vk_allocate(size_t size) { void ggml_vk_free_memory(ggml_vk_memory &memory) { - mgr.device()->destroy( + komputeManager()->device()->destroy( *memory.primaryBuffer, (vk::Optional)nullptr); if (memory.stagingBuffer) { - mgr.device()->destroy( + komputeManager()->device()->destroy( *memory.stagingBuffer, (vk::Optional)nullptr); } - mgr.device()->freeMemory( + komputeManager()->device()->freeMemory( *memory.primaryMemory, (vk::Optional)nullptr); if (memory.stagingMemory) { - mgr.device()->freeMemory( + komputeManager()->device()->freeMemory( *memory.stagingMemory, (vk::Optional)nullptr); } @@ -457,7 +479,7 @@ const std::shared_ptr ggml_vk_get_tensor(struct ggml_kompute_context nbytes += *alignedOffset; } - return mgr.tensor( + return komputeManager()->tensor( t->data, nelements, nbytes, kp::Tensor::TensorDataTypes::eFloat, @@ -476,7 +498,7 @@ void ggml_vk_add_buffer( void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { const auto res = ggml_vk_get_tensor(ctx, t, nullptr); GGML_ASSERT(res); - mgr.sequence()->eval({res}); + komputeManager()->sequence()->eval({res}); } void ggml_vk_h2d_all(struct ggml_kompute_context * ctx) { @@ -496,7 +518,7 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * const auto res = ggml_vk_get_tensor(ctx, t, nullptr); GGML_ASSERT(res); - mgr.sequence()->eval({res}); + komputeManager()->sequence()->eval({res}); } std::vector getSpirvShader(const unsigned char* rawData, size_t size) { @@ -537,10 +559,11 @@ void ggml_vk_add(kp::Sequence& seq, safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4) }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); @@ -567,10 +590,11 @@ void ggml_vk_addrow(kp::Sequence& seq, row }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); @@ -595,10 +619,11 @@ void ggml_vk_mul(kp::Sequence& seq, safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4) }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, 
inB, out}, spirv, {size}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); @@ -625,10 +650,11 @@ void ggml_vk_mulrow(kp::Sequence& seq, row }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); @@ -653,10 +679,11 @@ void ggml_vk_scale(kp::Sequence& seq, scale }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); @@ -676,10 +703,11 @@ void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, safe_divide(inOff, 4), safe_divide(outOff, 4), }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); @@ -729,10 +757,11 @@ void ggml_vk_soft_max(kp::Sequence& seq, ne00, ne01, ne02 }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setPushConstants({pushConsts}); @@ -761,10 +790,11 @@ void ggml_vk_norm_(const std::vector& spirv, kp::Sequence& seq, (uint32_t)ne00, (uint32_t)nb01, epsilon }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if 
(!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({(uint32_t)nrows}); s_algo->setPushConstants({pushConsts}); @@ -808,10 +838,11 @@ void ggml_vk_diag_mask_inf(kp::Sequence& seq, ne00, ne01 }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)}); s_algo->setPushConstants({pushConsts}); @@ -844,10 +875,11 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, ne00, nb01, nb02, nb11, nb12, ne0, ne1, }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)}); s_algo->setPushConstants({pushConsts}); @@ -871,10 +903,11 @@ void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_siz ne00, ne10, ne0, }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)}); s_algo->setPushConstants({pushConsts}); @@ -921,10 +954,11 @@ void ggml_vk_get_rows(const std::vector& spirv, ne00, nb01, nb1 }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); @@ -996,10 +1030,11 @@ void ggml_vk_rope(kp::Sequence& seq, nb0, nb1, nb2, nb3 }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = 
mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) + s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setPushConstants({pushConsts}); @@ -1032,10 +1067,14 @@ void ggml_vk_cpy(const std::vector& spirv, nb0, nb1, nb2, nb3 }; - static std::shared_ptr s_algo = nullptr; - if (!s_algo) - s_algo = mgr.algorithm(ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + static std::string unique_name = std::string(__func__) + + "_i_" + std::to_string(in_element_size) + + "_o_" + std::to_string(out_element_size); + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(unique_name)) + s_algo = komputeManager()->algorithm(unique_name, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); else { + s_algo = komputeManager()->getAlgorithm(unique_name); s_algo->setTensors({in, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setPushConstants({pushConsts}); @@ -1082,7 +1121,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph std::vector> sequences(n_seq); for (auto& sequence : sequences) { - sequence = mgr.sequence(); + sequence = komputeManager()->sequence(); } for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) { const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq; diff --git a/ggml-vulkan.h b/ggml-vulkan.h index d13ed4184..e1d20e388 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -40,6 +40,7 @@ std::vector ggml_vk_available_devices(size_t memoryRequired); bool ggml_vk_init_device(size_t memoryRequired, const std::string &device); bool ggml_vk_init_device(const ggml_vk_device &device); bool ggml_vk_init_device(int device); +bool ggml_vk_free_device(); bool ggml_vk_has_vulkan(); bool ggml_vk_has_device(); ggml_vk_device ggml_vk_current_device(); diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp index 9c41ec90f..ea81fd97b 100644 --- a/kompute/src/Algorithm.cpp +++ b/kompute/src/Algorithm.cpp @@ -58,18 +58,6 @@ Algorithm::destroy() this->mPipeline = nullptr; } - if (this->mFreePipelineCache && this->mPipelineCache) { - KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline cache"); - if (!this->mPipelineCache) { - KP_LOG_WARN("Kompute Algorithm Error requested to destroy " - "pipeline cache but it is null"); - } - this->mDevice->destroy( - *this->mPipelineCache, - (vk::Optional)nullptr); - this->mPipelineCache = nullptr; - } - if (this->mFreePipelineLayout && this->mPipelineLayout) { KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline layout"); if (!this->mPipelineLayout) { @@ -317,16 +305,6 @@ Algorithm::createPipeline() "main", &specializationInfo); - static std::shared_ptr globalPipelineCache = std::make_shared(); - if(!*globalPipelineCache) { - vk::PipelineCacheCreateInfo pipelineCacheInfo = - vk::PipelineCacheCreateInfo(); - this->mPipelineCache = globalPipelineCache; - this->mFreePipelineCache = true; - this->mDevice->createPipelineCache( - &pipelineCacheInfo, nullptr, 
globalPipelineCache.get()); - } - vk::ComputePipelineCreateInfo pipelineInfo(vk::PipelineCreateFlags(), shaderStage, *this->mPipelineLayout, @@ -335,7 +313,7 @@ Algorithm::createPipeline() #ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE vk::ResultValue pipelineResult = - this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo); + this->mDevice->createComputePipeline(*mPipelineCache, pipelineInfo); if (pipelineResult.result != vk::Result::eSuccess) { throw std::runtime_error("Failed to create pipeline result: " + @@ -347,7 +325,7 @@ Algorithm::createPipeline() this->mFreePipeline = true; #else vk::Pipeline pipeline = - this->mDevice->createComputePipeline(*globalPipelineCache, pipelineInfo) + this->mDevice->createComputePipeline(*mPipelineCache, pipelineInfo) .value; this->mPipeline = std::make_shared(pipeline); this->mFreePipeline = true; diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp index 2c86b6e10..2a02b7b10 100644 --- a/kompute/src/Manager.cpp +++ b/kompute/src/Manager.cpp @@ -88,15 +88,14 @@ Manager::destroy() this->mManagedSequences.clear(); } - if (this->mManageResources && this->mManagedAlgorithms.size()) { + if (this->mManageResources && !this->mManagedAlgorithmsMap.empty()) { KP_LOG_DEBUG("Kompute Manager explicitly freeing algorithms"); - for (const std::weak_ptr& weakAlgorithm : - this->mManagedAlgorithms) { - if (std::shared_ptr algorithm = weakAlgorithm.lock()) { + for (const auto& kv : this->mManagedAlgorithmsMap) { + if (std::shared_ptr algorithm = kv.second) { algorithm->destroy(); } } - this->mManagedAlgorithms.clear(); + this->mManagedAlgorithmsMap.clear(); } if (this->mManageResources && this->mManagedTensors.size()) { @@ -109,6 +108,18 @@ Manager::destroy() this->mManagedTensors.clear(); } + if (this->mPipelineCache) { + KP_LOG_DEBUG("Kompute Manager Destroying pipeline cache"); + if (!this->mPipelineCache) { + KP_LOG_WARN("Kompute Manager Error requested to destroy " + "pipeline cache but it is null"); + } + this->mDevice->destroy( + *this->mPipelineCache, + (vk::Optional)nullptr); + this->mPipelineCache = nullptr; + } + if (this->mFreeDevice) { KP_LOG_INFO("Destroying device"); this->mDevice->destroy( @@ -269,12 +280,14 @@ Manager::clear() end(this->mManagedTensors), [](std::weak_ptr t) { return t.expired(); }), end(this->mManagedTensors)); - this->mManagedAlgorithms.erase( - std::remove_if( - begin(this->mManagedAlgorithms), - end(this->mManagedAlgorithms), - [](std::weak_ptr t) { return t.expired(); }), - end(this->mManagedAlgorithms)); + for (auto it = this->mManagedAlgorithmsMap.begin(); + it != this->mManagedAlgorithmsMap.end();) { + if (it->second) { + it = this->mManagedAlgorithmsMap.erase(it); + } else { + ++it; + } + } this->mManagedSequences.erase( std::remove_if(begin(this->mManagedSequences), end(this->mManagedSequences), @@ -452,6 +465,12 @@ Manager::createDevice(const std::vector& familyQueueIndices, } KP_LOG_DEBUG("Kompute Manager compute queue obtained"); + + mPipelineCache = std::make_shared(); + vk::PipelineCacheCreateInfo pipelineCacheInfo = + vk::PipelineCacheCreateInfo(); + this->mDevice->createPipelineCache( + &pipelineCacheInfo, nullptr, mPipelineCache.get()); } std::shared_ptr diff --git a/kompute/src/include/kompute/Algorithm.hpp b/kompute/src/include/kompute/Algorithm.hpp index 90fe48fef..ef11234ee 100644 --- a/kompute/src/include/kompute/Algorithm.hpp +++ b/kompute/src/include/kompute/Algorithm.hpp @@ -45,6 +45,7 @@ class Algorithm */ template Algorithm(std::shared_ptr device, + vk::PipelineCache *pipelineCache, 
vk::DescriptorPool *pool, const std::vector>& tensors = {}, const std::vector& spirv = {}, @@ -55,6 +56,7 @@ class Algorithm KP_LOG_DEBUG("Kompute Algorithm Constructor with device"); this->mDevice = device; + this->mPipelineCache = pipelineCache; this->mDescriptorPool = pool; if (tensors.size() && spirv.size()) { @@ -310,8 +312,7 @@ class Algorithm bool mFreeShaderModule = false; std::shared_ptr mPipelineLayout; bool mFreePipelineLayout = false; - std::shared_ptr mPipelineCache; - bool mFreePipelineCache = false; + vk::PipelineCache *mPipelineCache = nullptr; std::shared_ptr mPipeline; bool mFreePipeline = false; diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp index 42336f4e8..e910b2b81 100644 --- a/kompute/src/include/kompute/Manager.hpp +++ b/kompute/src/include/kompute/Manager.hpp @@ -39,6 +39,10 @@ class Manager */ ~Manager(); + bool hasInstance() const { + return this->mInstance.get(); + } + bool hasDevice() const { return this->mDevice.get(); } @@ -149,6 +153,7 @@ class Manager * @returns Shared pointer with initialised algorithm */ std::shared_ptr algorithm( + const std::string &name, vk::DescriptorPool *pool, const std::vector>& tensors = {}, const std::vector& spirv = {}, @@ -157,7 +162,7 @@ class Manager const std::vector& pushConstants = {}) { return this->algorithm<>( - pool, tensors, spirv, workgroup, specializationConstants, pushConstants); + name, pool, tensors, spirv, workgroup, specializationConstants, pushConstants); } /** @@ -176,6 +181,7 @@ class Manager */ template std::shared_ptr algorithm( + const std::string &name, vk::DescriptorPool *pool, const std::vector>& tensors, const std::vector& spirv, @@ -188,6 +194,7 @@ class Manager std::shared_ptr algorithm{ new kp::Algorithm( this->mDevice, + mPipelineCache.get(), pool, tensors, spirv, @@ -196,12 +203,24 @@ class Manager pushConstants) }; if (this->mManageResources) { - this->mManagedAlgorithms.push_back(algorithm); + this->mManagedAlgorithmsMap.insert({name, algorithm}); } return algorithm; } + bool hasAlgorithm(const std::string &name) const { + return mManagedAlgorithmsMap.find(name) != mManagedAlgorithmsMap.end(); + } + + std::shared_ptr getAlgorithm(const std::string &name) const { + auto it = mManagedAlgorithmsMap.find(name); + if (it != mManagedAlgorithmsMap.end()) { + return it->second; + } + return nullptr; + } + /** * Destroy the GPU resources and all managed resources by manager. 
**/ @@ -237,6 +256,7 @@ class Manager std::shared_ptr device() const { return mDevice; } std::shared_ptr physicalDevice() const { return mPhysicalDevice; } + std::shared_ptr pipelineCache() const { return mPipelineCache; } private: // -------------- OPTIONALLY OWNED RESOURCES @@ -250,10 +270,11 @@ class Manager // -------------- ALWAYS OWNED RESOURCES std::vector> mManagedTensors; std::vector> mManagedSequences; - std::vector> mManagedAlgorithms; + std::unordered_map> mManagedAlgorithmsMap; std::vector mComputeQueueFamilyIndices; std::vector> mComputeQueues; + std::shared_ptr mPipelineCache; bool mManageResources = false; From beee57266f701ac75d41c25198e05a7d40a6dfd5 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Tue, 12 Sep 2023 12:36:13 -0700 Subject: [PATCH 06/93] Make kompute actually include external SDK headers when requested --- kompute/src/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt index 4179a81f2..329f9bf93 100644 --- a/kompute/src/CMakeLists.txt +++ b/kompute/src/CMakeLists.txt @@ -73,6 +73,8 @@ endif() if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER) target_link_libraries(kompute PUBLIC Vulkan-Headers) +else() + target_link_libraries(kompute PUBLIC Vulkan::Headers) endif() # #################################################### From 68cf1df6fba8a6f0ef4a8751133ac37b0963dd30 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 13 Sep 2023 10:32:43 -0400 Subject: [PATCH 07/93] Throw an exception when allocation fails for vulkan. --- ggml-vulkan.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 89de70fa4..c7bb3ed2b 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -325,7 +325,7 @@ vk::Buffer *ggml_vk_allocate_buffer(size_t size) { vk::Buffer *vkBuffer = new vk::Buffer; vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer); if (r != vk::Result::eSuccess) - std::cerr << "Error allocating buffer" << vk::to_string(r); + std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl; return vkBuffer; } @@ -358,8 +358,10 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v allocInfo.memoryTypeIndex = memoryTypeIndex; vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory; vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory); - if (r != vk::Result::eSuccess) - std::cerr << "Error allocating memory" << vk::to_string(r); + if (r != vk::Result::eSuccess) { + std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl; + throw std::runtime_error("Error allocating vulkan memory."); + } return vkDeviceMemory; } From 8bea7198792206f283a652bafe5b73686490ce01 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Wed, 13 Sep 2023 09:51:40 -0700 Subject: [PATCH 08/93] vulkan: disambiguate gpus with the same name --- ggml-vulkan.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index c7bb3ed2b..378f1d6e6 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -145,6 +145,8 @@ std::vector ggml_vk_available_devices(size_t memoryRequired) { if (deviceCount == 0) return results; + std::unordered_map count_by_name; + for (uint32_t i = 0; i < deviceCount; i++) { VkPhysicalDeviceProperties properties = physicalDevices.at(i).getProperties(); VkPhysicalDeviceMemoryProperties memoryProperties = physicalDevices.at(i).getMemoryProperties(); @@ -173,6 +175,10 @@ std::vector 
ggml_vk_available_devices(size_t memoryRequired) { d.type = properties.deviceType; d.heapSize = heapSize; d.name = properties.deviceName; + size_t n_idx = ++count_by_name[d.name]; + if (n_idx > 1) { + d.name += " (" + std::to_string(n_idx) + ")"; + } d.vendor = ggml_vk_getVendorName(properties.vendorID); results.push_back(d); } From bd5f6399bb7ae8068e83895356f125a6d8ee513b Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 13 Sep 2023 17:04:47 -0400 Subject: [PATCH 09/93] Don't try and install kompute artifacts. --- kompute/src/CMakeLists.txt | 4 ++-- kompute/src/include/CMakeLists.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt index 329f9bf93..b5c3879af 100644 --- a/kompute/src/CMakeLists.txt +++ b/kompute/src/CMakeLists.txt @@ -46,8 +46,8 @@ configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/komputeConfig.cmake.in "${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake" INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute) -install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake - ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute) +#install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake +# ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute) # #################################################### # Linking diff --git a/kompute/src/include/CMakeLists.txt b/kompute/src/include/CMakeLists.txt index 05e1ed5e1..313f48311 100644 --- a/kompute/src/include/CMakeLists.txt +++ b/kompute/src/include/CMakeLists.txt @@ -29,7 +29,7 @@ target_sources(kompute PRIVATE kompute/logger/Logger.hpp ) -install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +#install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) # #################################################### # Logger @@ -43,4 +43,4 @@ target_sources(kp_logger PRIVATE kompute/logger/Logger.hpp ) -install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) \ No newline at end of file +#install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) \ No newline at end of file From 4ed25b2f88e49b48677c100c03cc3d7159782075 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 13 Sep 2023 20:47:40 -0400 Subject: [PATCH 10/93] Sync from device back to host at begin of new prompt. --- llama.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llama.cpp b/llama.cpp index c835c6fd4..45db293be 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3820,6 +3820,10 @@ static bool llama_eval_internal( ggml_vk_graph_compute(lctx.ctx_kompute, gf); ggml_vk_d2h_tensor(lctx.ctx_kompute, res); } else { + if (lctx.ctx_kompute) { + ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k); + ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v); + } ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); if (lctx.ctx_kompute) { ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k); From 68aca6be08f05e6a3b66f58fd3c6eb69a0bbb0ca Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Thu, 14 Sep 2023 09:58:28 -0400 Subject: [PATCH 11/93] Only use vulkan with known quant that work. 
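Editor's note: the gist of this change is a whitelist — the Vulkan path is only taken for model file types whose Kompute shaders are known to be correct (F32, F16 and Q4_0, as enforced at context creation in llama.cpp later in this series). A minimal sketch of that gate, using the llama_ftype constants from the llama.cpp hunk; the helper name is hypothetical, the real check is inlined rather than factored out:

    static bool ggml_vk_ftype_is_supported(enum llama_ftype ftype) {
        // Only these quantizations have Kompute kernels known to work today.
        switch (ftype) {
            case LLAMA_FTYPE_ALL_F32:
            case LLAMA_FTYPE_MOSTLY_F16:
            case LLAMA_FTYPE_MOSTLY_Q4_0:
                return true;
            default:
                return false; // anything else stays on the CPU backend
        }
    }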
--- ggml-vulkan.cpp | 72 +++++++++++++++++++++++++------------------------ ggml-vulkan.h | 1 + 2 files changed, 38 insertions(+), 35 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 378f1d6e6..36cf0b8ae 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -54,22 +54,17 @@ #define QK4_1 32 typedef ggml_fp16_t half; - struct ggml_kompute_context { bool hasH2DAll = false; std::vector buffers; std::shared_ptr pool; - static ggml_kompute_context *instance; - ggml_kompute_context() { - instance = this; - } }; // FIXME: It would be good to consolidate the kompute manager and the kompute context into one object // and consolidate the init functions and simplify object lifetime management. As it currently stands, // we *have* to have the kompute manager no matter what for device discovery, but the kompute context // is only created when a device is set and vulkan is explicitly turned on. -ggml_kompute_context *ggml_kompute_context::instance; +ggml_kompute_context *s_kompute_context = nullptr; kp::Manager *komputeManager() { static kp::Manager *s_mgr = nullptr; if (s_mgr && !s_mgr->hasInstance()) { @@ -266,6 +261,10 @@ bool ggml_vk_has_device() { return komputeManager()->hasDevice(); } +bool ggml_vk_using_vulkan() { + return s_kompute_context != nullptr; +} + ggml_vk_device ggml_vk_current_device() { if (!komputeManager()->hasDevice()) return ggml_vk_device(); @@ -276,7 +275,8 @@ ggml_vk_device ggml_vk_current_device() { } ggml_kompute_context *ggml_vk_init() { - return new ggml_kompute_context; + s_kompute_context = new ggml_kompute_context; + return s_kompute_context; } bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) { @@ -284,6 +284,8 @@ bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) { } void ggml_vk_free(struct ggml_kompute_context * ctx) { + assert(ctx == s_kompute_context); + s_kompute_context = nullptr; delete ctx; } @@ -569,13 +571,13 @@ void ggml_vk_add(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -600,13 +602,13 @@ void ggml_vk_addrow(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -629,13 +631,13 @@ void ggml_vk_mul(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, 
ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -660,13 +662,13 @@ void ggml_vk_mulrow(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -689,13 +691,13 @@ void ggml_vk_scale(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -713,13 +715,13 @@ void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -767,13 +769,13 @@ void ggml_vk_soft_max(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + 
s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -800,13 +802,13 @@ void ggml_vk_norm_(const std::vector& spirv, kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({(uint32_t)nrows}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -848,13 +850,13 @@ void ggml_vk_diag_mask_inf(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -885,13 +887,13 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -913,13 +915,13 @@ void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_siz std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -964,13 +966,13 @@ void ggml_vk_get_rows(const std::vector& spirv, std::shared_ptr s_algo = nullptr; if 
(!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -1040,13 +1042,13 @@ void ggml_vk_rope(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } @@ -1080,13 +1082,13 @@ void ggml_vk_cpy(const std::vector& spirv, "_o_" + std::to_string(out_element_size); std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(unique_name)) - s_algo = komputeManager()->algorithm(unique_name, ggml_kompute_context::instance->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + s_algo = komputeManager()->algorithm(unique_name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); else { s_algo = komputeManager()->getAlgorithm(unique_name); s_algo->setTensors({in, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(ggml_kompute_context::instance->pool.get()); + s_algo->updateDescriptors(s_kompute_context->pool.get()); } seq.record(s_algo); } diff --git a/ggml-vulkan.h b/ggml-vulkan.h index e1d20e388..614959ba8 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -43,6 +43,7 @@ bool ggml_vk_init_device(int device); bool ggml_vk_free_device(); bool ggml_vk_has_vulkan(); bool ggml_vk_has_device(); +bool ggml_vk_using_vulkan(); ggml_vk_device ggml_vk_current_device(); struct ggml_kompute_context * ggml_vk_init(void); bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx); From addac252939a6e03e6d2b9fe8f840b5da66c89d4 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Thu, 14 Sep 2023 16:38:28 -0400 Subject: [PATCH 12/93] Set the singleton to nullptr here. 
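Editor's note: this touches the teardown ordering that the FIXME in the hunk below describes — the kompute context is owned by the llama context, while the device/manager is a process-wide singleton, so the two must be released in the right order. A rough sketch of the sequence assumed here, as llama_free() ends up wired later in the series (illustrative only):

    // Teardown order assumed by this patch:
    ggml_vk_free(ctx->ctx_kompute);  // delete the kompute context, clear the singleton pointer
    delete ctx;                      // the llama context owns the Vulkan-backed buffers
    ggml_vk_free_device();           // finally destroy the kompute manager / Vulkan device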
--- ggml-vulkan.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 36cf0b8ae..a008ed3fb 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -250,6 +250,10 @@ bool ggml_vk_free_device() { if (!ggml_vk_has_device()) return false; komputeManager()->destroy(); + // FIXME: The lifetime of these two needs to be tied together as we're relying upon the fact + // the llama_free(ctx) destroys this memory and we just set the singleton to nullptr here which + // is very brittle + s_kompute_context = nullptr; return true; } From 2c24d67e7b78e07390c247340f67300523033194 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Sat, 16 Sep 2023 12:17:29 -0400 Subject: [PATCH 13/93] Don't crash on available devices if we can't even create an instance. --- ggml-vulkan.cpp | 2 +- kompute/src/Manager.cpp | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index a008ed3fb..c64fde832 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -131,7 +131,7 @@ static std::string ggml_vk_getVendorName(uint32_t vendorID) { std::vector ggml_vk_available_devices(size_t memoryRequired) { std::vector results; - if (!komputeManager()->hasVulkan()) + if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance()) return results; std::vector physicalDevices = komputeManager()->listDevices(); diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp index 2a02b7b10..2a3ad2cc9 100644 --- a/kompute/src/Manager.cpp +++ b/kompute/src/Manager.cpp @@ -245,8 +245,15 @@ Manager::createInstance() VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); this->mInstance = std::make_shared(); - vk::createInstance( + vk::Result r = vk::createInstance( &computeInstanceCreateInfo, nullptr, this->mInstance.get()); + if (r != vk::Result::eSuccess) { + KP_LOG_ERROR( + "Kompute Manager Error allocating vulkan instance", vk::to_string(r)); + this->mInstance = nullptr; + this->mFreeInstance = false; + return; + } VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance); From 1b1416d7b73f4e857ed931eac7445d259b861fb2 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Thu, 21 Sep 2023 12:39:33 -0400 Subject: [PATCH 14/93] Support for gguf. 
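Editor's note: most of this patch is plumbing so that llama.cpp's host buffers can be backed by Vulkan memory — mmap is disabled on the Kompute path, llama_buffer gains a ggml_vk_memory handle, and the resulting buffers are registered with ggml_vk_add_buffer(). A condensed sketch of the allocation branch, written as a free function for brevity (the patch implements it inside llama_buffer::resize):

    void llama_buffer_resize(llama_buffer & buf, size_t n) {
        if (ggml_vk_has_device()) {
            // Vulkan-backed allocation: keep the ggml_vk_memory handle so the
            // buffer can later be registered via ggml_vk_add_buffer().
            buf.memory = ggml_vk_allocate(n);
            buf.data   = (uint8_t *) buf.memory.data;
            buf.size   = n;
            return;
        }
        buf.data = llama_host_malloc(n); // CPU fallback path
        buf.size = n;
    }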
--- llama.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/llama.cpp b/llama.cpp index 45db293be..e8ca52d5f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -514,6 +514,9 @@ static std::string llama_format_win_err(DWORD err) { struct llama_buffer { void * data = NULL; size_t size = 0; +#if defined(GGML_USE_KOMPUTE) + ggml_vk_memory memory; +#endif // fallback to malloc / free // useful in cases where CUDA can try to allocate PINNED memory @@ -522,6 +525,14 @@ struct llama_buffer { void resize(size_t n) { llama_host_free(data); +#if defined(GGML_USE_KOMPUTE) + if (ggml_vk_has_device()) { + this->memory = ggml_vk_allocate(n); + this->data = (uint8_t*)memory.data; + this->size = n; + return; + } +#endif data = llama_host_malloc(n); if (!data) { fallback = true; @@ -536,6 +547,13 @@ struct llama_buffer { ~llama_buffer() { if (data) { +#if defined(GGML_USE_KOMPUTE) + if (ggml_vk_has_device()) { + ggml_vk_free_memory(memory); + data = NULL; + return; + } +#endif if (fallback) { // NOLINT free(data); } else { @@ -1398,6 +1416,9 @@ struct llama_model_loader { use_mmap = false; } +#if defined(GGML_USE_KOMPUTE) + use_mmap = false; +#endif this->use_mmap = use_mmap; } @@ -6470,6 +6491,23 @@ struct llama_context * llama_new_context_with_model( LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0)); #undef LLAMA_METAL_CHECK_BUF } +#elif defined(GGML_USE_KOMPUTE) + if (ggml_vk_has_device() && params.n_gpu_layers > 0 + && (model->ftype == LLAMA_FTYPE_ALL_F32 + || model->ftype == LLAMA_FTYPE_MOSTLY_F16 + || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) { + // this allocates all Vulkan resources and memory buffers + ctx->ctx_kompute = ggml_vk_init(); + + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); + + printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + + ggml_vk_add_buffer(ctx->ctx_kompute, "data", ctx->model.buf.memory); + ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.memory); + ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->kv_self.buf.memory); + ggml_vk_add_buffer(ctx->ctx_kompute, "alloc", ctx->buf_alloc.memory); + } #endif } @@ -6503,7 +6541,13 @@ static struct llama_context * llama_init_from_file( } void llama_free(struct llama_context * ctx) { +#ifdef GGML_USE_KOMPUTE + ggml_vk_free(ctx->ctx_kompute); +#endif delete ctx; +#ifdef GGML_USE_KOMPUTE + ggml_vk_free_device(); +#endif } int llama_n_vocab(const struct llama_context * ctx) { From 6b6c73a9e3b299227cd5b51552f10e5d102810d4 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Tue, 26 Sep 2023 10:35:05 -0400 Subject: [PATCH 15/93] kompute : don't fail build because of -Warray-bounds There are some warnings in debug builds that are likely to be false positives. 
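Editor's note: with -Werror in effect, a single -Warray-bounds report from a debug build would fail the whole Kompute build, so the flag change below keeps -Werror but downgrades just that diagnostic back to a warning. A more targeted alternative, not what this patch does and shown only for comparison, would be to suppress the warning around the offending code with diagnostic pragmas:

    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Warray-bounds"
        // ... code that the compiler flags spuriously in debug builds ...
    #pragma GCC diagnostic pop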
--- kompute/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kompute/CMakeLists.txt b/kompute/CMakeLists.txt index aa228653a..1bd84d7ed 100644 --- a/kompute/CMakeLists.txt +++ b/kompute/CMakeLists.txt @@ -169,7 +169,7 @@ endif() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror -Wno-error=array-bounds") endif() # If glslang is cloned, then SPIRV/GlslangToSpv.h will be used instead of glslang/SPIRV/GlslangToSpv.h From 9e4f8b4acc387f3c0f0cdb62c2582dc01a67caad Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Tue, 26 Sep 2023 11:58:39 -0400 Subject: [PATCH 16/93] Upload immediately to device. --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index e8ca52d5f..1432696bd 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2787,7 +2787,7 @@ static struct ggml_cgraph * llm_build_llama( ggml_free(ctx0); #if defined(GGML_USE_KOMPUTE) - if (lctx.ctx_kompute && N == 1) { + if (lctx.ctx_kompute) { if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) { ggml_vk_h2d_all(lctx.ctx_kompute); } else { From 77135a3bf506d4ed782f5ea93ae6f3f61b056117 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Thu, 21 Sep 2023 13:00:10 -0400 Subject: [PATCH 17/93] Add a common boilerplate code via include and elim copy pasta --- CMakeLists.txt | 2 +- kompute/common.comp | 124 +++++++++++++++++++++++++++++++++++ kompute/op_add.comp | 117 +-------------------------------- kompute/op_addrow.comp | 117 +-------------------------------- kompute/op_cpy_f16_f16.comp | 117 +-------------------------------- kompute/op_cpy_f16_f32.comp | 117 +-------------------------------- kompute/op_cpy_f32_f16.comp | 117 +-------------------------------- kompute/op_cpy_f32_f32.comp | 117 +-------------------------------- kompute/op_diagmask.comp | 117 +-------------------------------- kompute/op_gelu.comp | 117 +-------------------------------- kompute/op_getrows_f16.comp | 117 +-------------------------------- kompute/op_getrows_q4_0.comp | 117 +-------------------------------- kompute/op_getrows_q4_1.comp | 117 +-------------------------------- kompute/op_mul.comp | 117 +-------------------------------- kompute/op_mul_mat_f16.comp | 117 +-------------------------------- kompute/op_mul_mat_q4_0.comp | 117 +-------------------------------- kompute/op_mul_mat_q4_1.comp | 117 +-------------------------------- kompute/op_mulrow.comp | 117 +-------------------------------- kompute/op_norm.comp | 117 +-------------------------------- kompute/op_relu.comp | 117 +-------------------------------- kompute/op_rmsnorm.comp | 117 +-------------------------------- kompute/op_rope.comp | 117 +-------------------------------- kompute/op_scale.comp | 116 +------------------------------- kompute/op_silu.comp | 117 +-------------------------------- kompute/op_softmax.comp | 117 +-------------------------------- 25 files changed, 148 insertions(+), 2668 deletions(-) create mode 100644 kompute/common.comp diff --git a/CMakeLists.txt b/CMakeLists.txt index 88585fb93..31532df91 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -429,7 +429,7 @@ if (LLAMA_KOMPUTE) set(spv_file ${source}.spv) add_custom_command( OUTPUT ${spv_file} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} ${CMAKE_CURRENT_SOURCE_DIR}/kompute/common.comp COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} 
${CMAKE_CURRENT_SOURCE_DIR}/${source} COMMENT "Compiling ${source} to ${source}.spv" ) diff --git a/kompute/common.comp b/kompute/common.comp new file mode 100644 index 000000000..12fc7d8b5 --- /dev/null +++ b/kompute/common.comp @@ -0,0 +1,124 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#extension GL_EXT_shader_16bit_storage: require +#extension GL_EXT_shader_8bit_storage: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_EXT_shader_explicit_arithmetic_types_int8: require +#extension GL_EXT_shader_explicit_arithmetic_types_int16: require +#extension GL_EXT_control_flow_attributes: enable + +#define QK4_0 32 +#define QR4_0 2 +#define QK4_1 32 + +#define GELU_COEF_A 0.044715 +#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +#define BM 128 +#define BN 128 +#define BK 8 +#define TM 8 +#define TN 8 + +#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) +#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) +#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) +#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) + +#define sizeof_block_q4_0 0x12 +#define sizeof_block_q4_1 0x14 +struct block_q4_0 { + float16_t d; + uint8_t qs[QK4_0 / 2]; +}; +struct block_q4_1 { + float16_t d; + float16_t m; + uint8_t qs[QK4_1 / 2]; +}; + +#ifndef QK_K +#define QK_K 256 +#endif + +#if QK_K == 256 +#define K_SCALE_SIZE 12 +#else +#define K_SCALE_SIZE 4 +#endif + +struct block_q2_K { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins +}; +// 84 bytes / block + +struct block_q3_K { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits +#if QK_K == 64 + uint8_t scales[2]; +#else + uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits +#endif + float16_t d; // super-block scale +}; + +#if QK_K == 64 +typedef struct { + float16_t d[2]; // super-block scales/mins + uint8_t scales[2]; + uint8_t qs[QK_K/2]; // 4-bit quants +} block_q4_K; +#else +struct block_q4_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +}; +#endif + +#if QK_K == 64 +struct block_q5_K { + float16_t d; // super-block scales/mins + int8_t scales[QK_K/16]; // 8-bit block scales + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +#else +struct block_q5_K { + float16_t d; // super-block scale for quantized scales + float16_t dmin; // super-block scale for quantized mins + uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +}; +// 176 bytes / block +#endif + +struct block_q6_K { + uint8_t ql[QK_K/2]; // quants, lower 4 bits 
+ uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + float16_t d; // super-block scale +}; +// 210 bytes / block diff --git a/kompute/op_add.comp b/kompute/op_add.comp index 7e4e43d75..019a68449 100644 --- a/kompute/op_add.comp +++ b/kompute/op_add.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_addrow.comp 
b/kompute/op_addrow.comp index 492f672e5..926c929e4 100644 --- a/kompute/op_addrow.comp +++ b/kompute/op_addrow.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp index 40d756ae5..5f425ae28 100644 --- a/kompute/op_cpy_f16_f16.comp +++ b/kompute/op_cpy_f16_f16.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: 
require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 32 #define IN_TYPE float16_t diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp index 309c48aed..4298bebdd 100644 --- a/kompute/op_cpy_f16_f32.comp +++ b/kompute/op_cpy_f16_f32.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension 
GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 32 #define IN_TYPE float16_t diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp index fb0e00d67..2d763edfd 100644 --- a/kompute/op_cpy_f32_f16.comp +++ b/kompute/op_cpy_f32_f16.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - 
-#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 32 #define IN_TYPE float diff --git a/kompute/op_cpy_f32_f32.comp b/kompute/op_cpy_f32_f32.comp index f43480b8d..4e5b1d393 100644 --- a/kompute/op_cpy_f32_f32.comp +++ b/kompute/op_cpy_f32_f32.comp @@ -1,121 +1,6 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif 
- -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 32 #define IN_TYPE float diff --git a/kompute/op_diagmask.comp b/kompute/op_diagmask.comp index 18b0192d7..8dc2cc60a 100644 --- a/kompute/op_diagmask.comp +++ b/kompute/op_diagmask.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) 
(((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp index 8079b8ef2..c9f8ce3cf 100644 --- a/kompute/op_gelu.comp +++ b/kompute/op_gelu.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | 
buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_getrows_f16.comp b/kompute/op_getrows_f16.comp index e0f5bb16e..17b478b5e 100644 --- a/kompute/op_getrows_f16.comp +++ b/kompute/op_getrows_f16.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t 
qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_getrows_q4_0.comp b/kompute/op_getrows_q4_0.comp index cddba929b..590f218e6 100644 --- a/kompute/op_getrows_q4_0.comp +++ b/kompute/op_getrows_q4_0.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 
-#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp index 151848a9d..44718c6af 100644 --- a/kompute/op_getrows_q4_1.comp +++ b/kompute/op_getrows_q4_1.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // 
super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp index 4907015d8..348eae7b3 100644 --- a/kompute/op_mul.comp +++ b/kompute/op_mul.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - 
low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp index f1198b593..1390c00cf 100644 --- a/kompute/op_mul_mat_f16.comp +++ b/kompute/op_mul_mat_f16.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 
-typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 64) in; diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp index 206aea7d5..9b6dd72dc 100644 --- a/kompute/op_mul_mat_q4_0.comp +++ b/kompute/op_mul_mat_q4_0.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // 
super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 8, local_size_y = 8) in; diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp index 8bdf810a1..fb7b051b8 100644 --- a/kompute/op_mul_mat_q4_1.comp +++ b/kompute/op_mul_mat_q4_1.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits 
- uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 8, local_size_y = 8) in; diff --git a/kompute/op_mulrow.comp b/kompute/op_mulrow.comp index 3defd0a5f..498dbdfcd 100644 --- a/kompute/op_mulrow.comp +++ b/kompute/op_mulrow.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t 
qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp index ec0a8568d..4b2db25e3 100644 --- a/kompute/op_norm.comp +++ b/kompute/op_norm.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized 
mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 256 diff --git a/kompute/op_relu.comp b/kompute/op_relu.comp index bc2c31f43..41f46be96 100644 --- a/kompute/op_relu.comp +++ b/kompute/op_relu.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t 
ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp index 784713c36..dd2c5cdde 100644 --- a/kompute/op_rmsnorm.comp +++ b/kompute/op_rmsnorm.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block 
+#include "common.comp" #define nth 256 diff --git a/kompute/op_rope.comp b/kompute/op_rope.comp index ca6bb6831..3fa84f579 100644 --- a/kompute/op_rope.comp +++ b/kompute/op_rope.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp index f537121a4..8530aaf3e 100644 --- a/kompute/op_scale.comp +++ b/kompute/op_scale.comp @@ -8,122 +8,8 @@ #version 450 
-#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable +#include "common.comp" -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block layout(local_size_x = 1) in; layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp index 90c034ac7..c5acac281 100644 --- a/kompute/op_silu.comp +++ b/kompute/op_silu.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension 
GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" layout(local_size_x = 1) in; diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp index ce0e71924..e936d8f68 100644 --- a/kompute/op_softmax.comp +++ b/kompute/op_softmax.comp @@ -8,122 +8,7 @@ #version 450 -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension 
GL_EXT_control_flow_attributes: enable - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; - -#ifndef QK_K -#define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -// 210 bytes / block +#include "common.comp" #define nth 32 From 93306f16d046831a750d9971be3b720ef0ef8136 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Fri, 29 Sep 2023 10:02:22 -0400 Subject: [PATCH 18/93] Consolidate code for mat x vec kernels and use subgroups more extensively. 
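The shared-memory tree reductions in the mat x vec and softmax kernels are replaced
with subgroup arithmetic: every lane accumulates a private partial sum over a strided
slice of the row, a single subgroupAdd() produces the total, and subgroupElect() picks
one lane to write it back. The workgroup width becomes a specialization constant
(local_size_x_id = 0) that the host sets from the device subgroup size, and devices
reporting a subgroup size below 32 are filtered out up front. A simplified sketch of
the pattern, shown here for a plain f32 dot product (buffer names and the single `n`
push constant are illustrative; the real kernels carry the full ne*/nb* strides and
offsets):

    #version 450
    #extension GL_KHR_shader_subgroup_arithmetic : require

    layout(local_size_x_id = 0) in;   // host sets this to a multiple of the subgroup size
    layout(binding = 0) readonly buffer tensorInA { float inA[]; };
    layout(binding = 1) readonly buffer tensorInB { float inB[]; };
    layout(binding = 2) writeonly buffer tensorOut { float out_[]; };
    layout(push_constant) uniform parameter { uint n; } pcs;

    void main() {
        const uint row = gl_WorkGroupID.x;

        // each lane sums a strided slice of the row
        float sumf = 0.0f;
        for (uint i = gl_SubgroupInvocationID; i < pcs.n; i += gl_SubgroupSize) {
            sumf += inA[row * pcs.n + i] * inB[i];
        }

        // subgroup-wide reduction, no shared memory or barriers needed
        const float total = subgroupAdd(sumf);
        if (subgroupElect()) {
            out_[row] = total;   // one lane per subgroup writes the result
        }
    }

Dropping the shared buf[]/barrier() passes keeps the kernels shorter and lets the
quantized mat x vec variants share a common body (op_mul_mv_q_n.comp), which only
has to supply a per-format block_q_n_dot_y() dot product.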
--- ggml-vulkan.cpp | 91 ++++++++++------ ggml-vulkan.h | 1 + kompute/op_getrows_q4_1.comp | 2 +- kompute/op_mul_mat_f16.comp | 29 ++--- kompute/op_mul_mat_q4_0.comp | 77 +++++-------- kompute/op_mul_mat_q4_1.comp | 102 ++++++------------ kompute/op_mul_mv_q_n.comp | 49 +++++++++ kompute/op_softmax.comp | 54 +++------- kompute/src/CMakeLists.txt | 1 + kompute/src/OpTensorFill.cpp | 55 ++++++++++ kompute/src/Tensor.cpp | 7 ++ kompute/src/include/CMakeLists.txt | 1 + kompute/src/include/kompute/Kompute.hpp | 1 + kompute/src/include/kompute/Tensor.hpp | 4 + .../kompute/operations/OpTensorFill.hpp | 58 ++++++++++ llama.cpp | 3 +- 16 files changed, 321 insertions(+), 214 deletions(-) create mode 100644 kompute/op_mul_mv_q_n.comp create mode 100644 kompute/src/OpTensorFill.cpp create mode 100644 kompute/src/include/kompute/operations/OpTensorFill.hpp diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index c64fde832..74dd0f00f 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -165,11 +165,20 @@ std::vector ggml_vk_available_devices(size_t memoryRequired) { if (heapSize < memoryRequired) continue; + vk::PhysicalDeviceSubgroupProperties subgroupProperties; + vk::PhysicalDeviceProperties2 deviceProperties2; + deviceProperties2.pNext = &subgroupProperties; + physicalDevices.at(i).getProperties2(&deviceProperties2); + + if (subgroupProperties.subgroupSize < 32) + continue; + ggml_vk_device d; d.index = i; d.type = properties.deviceType; d.heapSize = heapSize; d.name = properties.deviceName; + d.subgroupSize = subgroupProperties.subgroupSize; size_t n_idx = ++count_by_name[d.name]; if (n_idx > 1) { d.name += " (" + std::to_string(n_idx) + ")"; @@ -242,7 +251,7 @@ bool ggml_vk_init_device(const ggml_vk_device &device) { bool ggml_vk_init_device(int device) { komputeManager()->initializeDevice(device, {}, {"VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage", - "VK_KHR_16bit_storage", "VK_KHR_storage_buffer_storage_class"}); + "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info"}); return ggml_vk_has_device(); } @@ -772,9 +781,10 @@ void ggml_vk_soft_max(kp::Sequence& seq, }; std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); - else { + if (!komputeManager()->hasAlgorithm(__func__)) { + const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts}); + } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); @@ -890,9 +900,10 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, }; std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {}, {pushConsts}); - else { + if (!komputeManager()->hasAlgorithm(__func__)) { + const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); + } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); 
s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)}); @@ -907,26 +918,28 @@ void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_siz const std::shared_ptr& inB, const std::shared_ptr& out, uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne10, int32_t ne0, - int32_t ne01, int32_t ne11) { + int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1, + int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) { struct PushConstants { uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne10, ne0; + int32_t ne00, ne10, ne0, ne1, ne01, gqa; } pushConsts { safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne10, ne0, + ne00, ne10, ne0, ne1, ne01, ne12/ne02 }; std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11)}, {}, {pushConsts}); - else { + if (!komputeManager()->hasAlgorithm(__func__)) { + const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); + } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11)}); + s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12)}); s_algo->setPushConstants({pushConsts}); s_algo->updateDescriptors(s_kompute_context->pool.get()); } + seq.record({out}); seq.record(s_algo); } @@ -1182,7 +1195,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph const uint32_t nb3 = dst ? dst->nb[3] : 0; const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; -// const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; + const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; const enum ggml_type dstt = dst ? 
dst->type : GGML_TYPE_COUNT; const static std::shared_ptr nullTensor = nullptr; @@ -1263,30 +1276,46 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_MUL_MAT: { - if ((src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32) - && src1->type == GGML_TYPE_F32) { - ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); - } else if (src0->type == GGML_TYPE_Q4_0 - && src1->type == GGML_TYPE_F32) { - ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11); - } else if (src0->type == GGML_TYPE_Q4_1 - && src1->type == GGML_TYPE_F32) { - ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne01, ne11); - } else { - fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0->type, src1->type); + if (src1t != GGML_TYPE_F32) { + fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); goto not_implemented; } + + if (!ggml_is_transposed(src0) + && !ggml_is_transposed(src1) + && ne00%32 == 0 + && ne11 > 1) { + fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); + goto not_implemented; + } else { + switch (src0t) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); + break; + case GGML_TYPE_Q4_0: + ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); + break; + case GGML_TYPE_Q4_1: + ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); + break; + default: { + fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); + goto not_implemented; + } + } + } } break; case GGML_OP_GET_ROWS: { - if (src0->type == GGML_TYPE_F16) { + if (src0t == GGML_TYPE_F16) { ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else if (src0->type == GGML_TYPE_Q4_0) { + } else if (src0t == GGML_TYPE_Q4_0) { ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else if (src0->type == GGML_TYPE_Q4_1) { + } else if (src0t == GGML_TYPE_Q4_1) { ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); } else { - fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0->type); + fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t); goto not_implemented; } } break; diff --git a/ggml-vulkan.h b/ggml-vulkan.h index 614959ba8..7989cfc1f 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -34,6 +34,7 @@ struct ggml_vk_device { size_t heapSize = 0; std::string name; std::string vendor; + int subgroupSize = 0; }; std::vector ggml_vk_available_devices(size_t memoryRequired); diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp index 44718c6af..3d00928d3 100644 --- a/kompute/op_getrows_q4_1.comp +++ b/kompute/op_getrows_q4_1.comp @@ -43,7 +43,7 @@ void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based 
fro const uint nb = k / qk; for (uint i = 0; i < nb; i++) { - const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_0); + const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1); const float16_t d = block.d; const float16_t m = block.m; diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp index 1390c00cf..72a667f92 100644 --- a/kompute/op_mul_mat_f16.comp +++ b/kompute/op_mul_mat_f16.comp @@ -10,7 +10,9 @@ #include "common.comp" -layout(local_size_x = 64) in; +#extension GL_KHR_shader_subgroup_arithmetic : require + +layout(local_size_x_id = 0) in; layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; layout (binding = 1) readonly buffer tensorInB { float inB[]; }; @@ -29,8 +31,6 @@ layout (push_constant) uniform parameter { int ne1; } pcs; -shared float sum[gl_WorkGroupSize.x]; - void main() { const uint r0 = gl_WorkGroupID.x; const uint r1 = gl_WorkGroupID.y; @@ -39,24 +39,13 @@ void main() { const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB - sum[gl_LocalInvocationID.x] = 0.0; - - for (uint i = gl_LocalInvocationID.x; i < pcs.ne00; i += gl_WorkGroupSize.x) { - sum[gl_LocalInvocationID.x] += float(inA[x+i]) * float(inB[y+i]); + float sumf = 0.0f; + for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { + sumf += float(inA[x+i]) * float(inB[y+i]); } - // accumulate the sum from all threads in the threadgroup - barrier(); - memoryBarrierShared(); - [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; - } - barrier(); - memoryBarrierShared(); - } - - if (gl_LocalInvocationID.x == 0) { - out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = sum[0]; + const float all_sum = subgroupAdd(sumf); + if (subgroupElect()) { + out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum; } } diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp index 9b6dd72dc..165df3c37 100644 --- a/kompute/op_mul_mat_q4_0.comp +++ b/kompute/op_mul_mat_q4_0.comp @@ -10,7 +10,13 @@ #include "common.comp" -layout(local_size_x = 8, local_size_y = 8) in; +#define BLOCKS_IN_QUANT QK4_0 +#define SIZE_OF_BLOCK sizeof_block_q4_0 +#define N_ROWS 4 + +layout(local_size_x_id = 0) in; +layout(local_size_y = 1) in; +layout(local_size_z = 1) in; layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; layout (binding = 1) readonly buffer tensorInB { float inB[]; }; @@ -23,58 +29,31 @@ layout (push_constant) uniform parameter { int ne00; int ne10; int ne0; + int ne1; + int ne01; + int gqa; } pcs; -shared float sum[64]; +// The q4_0 version of this function +float block_q_n_dot_y(uint block_index, uint yb, uint il) { + vec2 acc = vec2(0.0, 0.0); + const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; + float d = float(u8BufToFloat16(inA, index)); + float sumy = 0.0f; + for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { + const uint16_t b = u8BufToU16(inA, index + 2 + il + i); -void main() { - const uint nb = uint(pcs.ne00/QK4_0); + const float yl0 = inB[yb + i]; + const float yl1 = inB[yb + i + 1]; + const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; + const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; - const uint r0 = gl_WorkGroupID.x; - const uint r1 = gl_WorkGroupID.y; + sumy += yl0 + yl1 + yl8 + yl9; - const uint x = r0*nb; // Based from inA without base offset - const 
uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based from inB - - const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y; - const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y; - - const uint ix = gl_LocalInvocationID.y/4; // 0 or 1 - const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3 - - const uint first = 4 * iy; - - float sumf = 0.0; - - for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) { - const uint index = (x+i)*sizeof_block_q4_0+pcs.inAOff; - const float d = float(u8BufToFloat16(inA, index)); - - const uint xl = first; // Based from bl->qs - const uint yl = y + i * QK4_0 + first; // Based from inB - - vec2 acc = vec2(0.0, 0.0); - - for (int j = 0; j < 4; ++j) { - const uint8_t b = inA[index+2+xl+j]; - acc.x += inB[yl+j] * (b & 0xF) + inB[yl+j+16] * (b >> 4); - acc.y += inB[yl+j] + inB[yl+j+16]; - } - - sumf += d * (acc.x - 8.*acc.y); - } - - sum[ith] = sumf; - - // - // Accumulate the sum from all threads in the threadgroup - // - barrier(); - if (ith == 0) { - float sumTotal = 0.0; - for (uint i = 0; i < nth; ++i) { - sumTotal += sum[i]; - } - out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sumTotal; + acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); + acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); } + return d * (sumy * -8.f + acc[0] + acc[1]); } + +#include "op_mul_mv_q_n.comp" diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp index fb7b051b8..683b695ca 100644 --- a/kompute/op_mul_mat_q4_1.comp +++ b/kompute/op_mul_mat_q4_1.comp @@ -10,7 +10,13 @@ #include "common.comp" -layout(local_size_x = 8, local_size_y = 8) in; +#define BLOCKS_IN_QUANT QK4_1 +#define SIZE_OF_BLOCK sizeof_block_q4_1 +#define N_ROWS 4 + +layout(local_size_x_id = 0) in; +layout(local_size_y = 1) in; +layout(local_size_z = 1) in; layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; layout (binding = 1) readonly buffer tensorInB { float inB[]; }; @@ -23,81 +29,33 @@ layout (push_constant) uniform parameter { int ne00; int ne10; int ne0; + int ne1; + int ne01; + int gqa; } pcs; -shared float sum[gl_WorkGroupSize.x*gl_WorkGroupSize.y]; +// The q4_1 version of this function +float block_q_n_dot_y(uint block_index, uint yb, uint il) { + vec2 acc = vec2(0.0, 0.0); + const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; + float d = float(u8BufToFloat16(inA, index)); + float m = float(u8BufToFloat16(inA, index+2)); -#define UNALIGNED_INPUT inA + float sumy = 0.0f; + for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { + const uint16_t b = u8BufToU16(inA, index + 4 + il + i); -block_q4_1 get_unaligned_block_q4_1(uint index) { - block_q4_1 fres; - fres.d = u8BufToFloat16(UNALIGNED_INPUT, index); - fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2); - [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) { - fres.qs[it] = UNALIGNED_INPUT[index+4+it]; + const float yl0 = inB[yb + i]; + const float yl1 = inB[yb + i + 1]; + const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; + const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; + + sumy += yl0 + yl1 + yl8 + yl9; + + acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); + acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); } - return fres; + return d * (acc[0] + acc[1]) + sumy * m; } -void main() { - const uint nb = uint(pcs.ne00/QK4_1); - - const uint r0 = gl_WorkGroupID.x; - const uint r1 = gl_WorkGroupID.y; - - const uint x = r0*nb; // Based from inA without base offset - const uint y = r1*uint(pcs.ne10) + pcs.inBOff; // Based 
from inB - - const uint nth = gl_WorkGroupSize.x*gl_WorkGroupSize.y; - const uint ith = gl_WorkGroupSize.y*gl_LocalInvocationID.x + gl_LocalInvocationID.y; - - const uint ix = gl_LocalInvocationID.y/4; // 0 or 1 - const uint iy = gl_LocalInvocationID.y - 4*ix; // 0...3 - - const uint first = 4 * iy; - - float sumf = 0.0; - - for (uint i = 2*gl_LocalInvocationID.x + ix; i < nb; i += 2*gl_WorkGroupSize.x) { - //TODO: Removing the use of pointers has been quite hairy here. If something goes wrong here, this is most likely it: - - const block_q4_1 block = get_unaligned_block_q4_1((x+i)*sizeof_block_q4_1+pcs.inAOff); - - const float d = float(block.d); - const float m = float(block.m); - - const uint xl = first; // Based from bl->qs - const uint yl = y + i * QK4_1 + first; // Based from inB - - vec2 acc = vec2(0.0, 0.0); - - for (int j = 0; j < 4; ++j) { - acc.x += inB[yl+j] * (d * (block.qs[xl+j] & 0xF) + m); - acc.y += inB[yl+j+16] * (d * (block.qs[xl+j] >> 4) + m); - } - - sumf += d * (acc.x - acc.y); - } - - sum[ith] = sumf; - - // - // Accumulate the sum from all threads in the threadgroup - // - barrier(); - memoryBarrierShared(); - if (ith%4 == 0) { - sum[ith] += sum[ith+1] + sum[ith+2] + sum[ith+3]; - } - barrier(); - memoryBarrierShared(); - if (ith%16 == 0) { - sum[ith] += sum[ith+4] + sum[ith+8] + sum[ith+12]; - } - barrier(); - memoryBarrierShared(); - if (ith == 0) { - for (uint i = 16; i < nth; i += 16) sum[0] += sum[i]; - out_[r1*uint(pcs.ne0) + r0 + pcs.outOff] = sum[0]; - } -} +#include "op_mul_mv_q_n.comp" diff --git a/kompute/op_mul_mv_q_n.comp b/kompute/op_mul_mv_q_n.comp new file mode 100644 index 000000000..83de952dd --- /dev/null +++ b/kompute/op_mul_mv_q_n.comp @@ -0,0 +1,49 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_debug_printf : enable + +void main() { + const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT); + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + const uint im = gl_WorkGroupID.z; + const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS; + const uint offset0 = first_row * nb + im/pcs.gqa*(nb*pcs.ne0); + + const uint x = offset0; // Based from inA without base offset + const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB + + float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f}; + + const uint ix = gl_SubgroupInvocationID/2; + const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2); + + uint yb = y + ix * BLOCKS_IN_QUANT + il; + + debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n", + gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize, + gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z); + + for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) { + for (int row = 0; row < N_ROWS; row++) { + const uint block_index = x + ib + row * nb; + sumf[row] += block_q_n_dot_y(block_index, yb, il); + } + + yb += BLOCKS_IN_QUANT * gl_SubgroupSize/2; + } + + for (int row = 0; row < N_ROWS; ++row) { + const float tot = subgroupAdd(sumf[row]); + if (first_row + row < pcs.ne01 && subgroupElect()) { + out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot; + } + } +} diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp index e936d8f68..60456a3bb 100644 --- a/kompute/op_softmax.comp +++ b/kompute/op_softmax.comp @@ -10,9 +10,9 @@ #include "common.comp" -#define nth 32 +#extension GL_KHR_shader_subgroup_arithmetic : require -layout(local_size_x = nth) in; +layout(local_size_x_id = 0) in; layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; @@ -25,8 +25,6 @@ layout(push_constant) uniform PushConstants { int ne02; } pcs; -shared float buf[nth]; - void main() { const uint i03 = gl_WorkGroupID.z; const uint i02 = gl_WorkGroupID.y; @@ -37,46 +35,22 @@ void main() { const uint pdst = extra_off + pcs.outOff; // Based from out_ // parallel max - buf[gl_LocalInvocationID.x] = uintBitsToFloat(0xFF800000); - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { - buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], in_[psrc0 + i00]); + float localMax = uintBitsToFloat(0xFF800000); + for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) { + localMax = max(localMax, in_[psrc0 + i00]); } - - // reduce - barrier(); - memoryBarrierShared(); - [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - buf[gl_LocalInvocationID.x] = max(buf[gl_LocalInvocationID.x], buf[gl_LocalInvocationID.x + i]); - } - barrier(); - memoryBarrierShared(); - } - - // broadcast - const float max_ = buf[0]; + float max_ = subgroupMax(localMax); // parallel sum - buf[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { - buf[gl_LocalInvocationID.x] += exp(in_[psrc0 + i00] - max_); + float localSum = 0.0f; + for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) { + const float exp_psrc0 = exp(in_[psrc0 + i00] - max_); + localSum += exp_psrc0; + out_[pdst + i00] = exp_psrc0; } - 
// reduce - barrier(); - memoryBarrierShared(); - [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - buf[gl_LocalInvocationID.x] += buf[gl_LocalInvocationID.x + i]; - } - barrier(); - memoryBarrierShared(); - } - - // broadcast - const float sum = buf[0]; - - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { - out_[pdst + i00] = exp(in_[psrc0 + i00] - max_) / sum; + const float sum = subgroupAdd(localSum); + for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) { + out_[pdst + i00] /= sum; } } diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt index b5c3879af..42b7d07f5 100644 --- a/kompute/src/CMakeLists.txt +++ b/kompute/src/CMakeLists.txt @@ -13,6 +13,7 @@ add_library(kompute STATIC Algorithm.cpp OpAlgoDispatch.cpp OpMemoryBarrier.cpp OpTensorCopy.cpp + OpTensorFill.cpp OpTensorSyncDevice.cpp OpTensorSyncLocal.cpp OpBufferSyncDevice.cpp diff --git a/kompute/src/OpTensorFill.cpp b/kompute/src/OpTensorFill.cpp new file mode 100644 index 000000000..da477dcc7 --- /dev/null +++ b/kompute/src/OpTensorFill.cpp @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: Apache-2.0 + +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#include "kompute/operations/OpTensorFill.hpp" +#include "kompute/Tensor.hpp" + +namespace kp { + +OpTensorFill::OpTensorFill(const std::vector>& tensors) +{ + KP_LOG_DEBUG("Kompute OpTensorFill constructor with params"); + + if (tensors.size() < 1) { + throw std::runtime_error( + "Kompute OpTensorFill called with less than 1 tensor"); + } + + this->mTensors = tensors; +} + +OpTensorFill::~OpTensorFill() +{ + KP_LOG_DEBUG("Kompute OpTensorFill destructor started"); +} + +void +OpTensorFill::record(const vk::CommandBuffer& commandBuffer) +{ + KP_LOG_DEBUG("Kompute OpTensorFill record called"); + + for (size_t i = 0; i < this->mTensors.size(); i++) { + this->mTensors[i]->recordFill(commandBuffer, 0); + } +} + +void +OpTensorFill::preEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorFill preEval called"); +} + +void +OpTensorFill::postEval(const vk::CommandBuffer& /*commandBuffer*/) +{ + KP_LOG_DEBUG("Kompute OpTensorFill postEval called"); +} + +} diff --git a/kompute/src/Tensor.cpp b/kompute/src/Tensor.cpp index 9c343ff13..65279206d 100644 --- a/kompute/src/Tensor.cpp +++ b/kompute/src/Tensor.cpp @@ -215,6 +215,13 @@ Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer, commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion); } +void +Tensor::recordFill(const vk::CommandBuffer &commandBuffer, + uint32_t fill) +{ + commandBuffer.fillBuffer(*this->mPrimaryBuffer, mOffset, this->memorySize(), fill); +} + void Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, vk::AccessFlagBits srcAccessMask, diff --git a/kompute/src/include/CMakeLists.txt b/kompute/src/include/CMakeLists.txt index 313f48311..53e9d8ae6 100644 --- a/kompute/src/include/CMakeLists.txt +++ b/kompute/src/include/CMakeLists.txt @@ -21,6 +21,7 @@ target_sources(kompute PRIVATE kompute/operations/OpMemoryBarrier.hpp kompute/operations/OpMult.hpp kompute/operations/OpTensorCopy.hpp + kompute/operations/OpTensorFill.hpp 
kompute/operations/OpTensorSyncDevice.hpp kompute/operations/OpTensorSyncLocal.hpp kompute/operations/OpBufferSyncDevice.hpp diff --git a/kompute/src/include/kompute/Kompute.hpp b/kompute/src/include/kompute/Kompute.hpp index f59a63b50..70e0dd433 100644 --- a/kompute/src/include/kompute/Kompute.hpp +++ b/kompute/src/include/kompute/Kompute.hpp @@ -15,6 +15,7 @@ #include "operations/OpTensorSyncLocal.hpp" #include "operations/OpBufferSyncDevice.hpp" #include "operations/OpBufferSyncLocal.hpp" +#include "operations/OpTensorFill.hpp" // Will be build by CMake and placed inside the build directory #include "ShaderLogisticRegression.hpp" diff --git a/kompute/src/include/kompute/Tensor.hpp b/kompute/src/include/kompute/Tensor.hpp index 4c260ce6b..2ab88eb30 100644 --- a/kompute/src/include/kompute/Tensor.hpp +++ b/kompute/src/include/kompute/Tensor.hpp @@ -126,6 +126,9 @@ class Tensor void recordCopyFrom(const vk::CommandBuffer& commandBuffer, std::shared_ptr copyFromTensor); + void recordFill(const vk::CommandBuffer &commandBuffer, + uint32_t fill); + /** * Records a copy from the internal staging memory to the device memory * using an optional barrier to wait for the operation. This function would @@ -279,6 +282,7 @@ class Tensor vk::Buffer *bufferTo, vk::DeviceSize bufferSize, vk::BufferCopy copyRegion); + void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, const vk::Buffer& buffer, vk::AccessFlagBits srcAccessMask, diff --git a/kompute/src/include/kompute/operations/OpTensorFill.hpp b/kompute/src/include/kompute/operations/OpTensorFill.hpp new file mode 100644 index 000000000..9a6bf131e --- /dev/null +++ b/kompute/src/include/kompute/operations/OpTensorFill.hpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include "kompute/Core.hpp" + +#include "kompute/Tensor.hpp" + +#include "kompute/operations/OpBase.hpp" + +namespace kp { + +/** + * Operation that fills the tensor + */ +class OpTensorFill : public OpBase +{ + public: + /** + * Default constructor with parameters that provides the core vulkan + * resources and the tensors that will be used in the operation. + * + * @param tensors Tensors that will be used to create in operation. + */ + OpTensorFill(const std::vector>& tensors); + + /** + * Default destructor. This class does not manage memory so it won't be + * expecting the parent to perform a release. + */ + ~OpTensorFill() override; + + /** + * Records the fill command for tensor. + * + * @param commandBuffer The command buffer to record the command into. + */ + void record(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any preEval commands. + * + * @param commandBuffer The command buffer to record the command into. + */ + virtual void preEval(const vk::CommandBuffer& commandBuffer) override; + + /** + * Does not perform any postEval commands. + * + * @param commandBuffer The command buffer to record the command into. 
+ */ + virtual void postEval(const vk::CommandBuffer& commandBuffer) override; + + private: + // -------------- ALWAYS OWNED RESOURCES + std::vector> mTensors; +}; + +} // End namespace kp diff --git a/llama.cpp b/llama.cpp index 1432696bd..245174898 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6495,7 +6495,8 @@ struct llama_context * llama_new_context_with_model( if (ggml_vk_has_device() && params.n_gpu_layers > 0 && (model->ftype == LLAMA_FTYPE_ALL_F32 || model->ftype == LLAMA_FTYPE_MOSTLY_F16 - || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0)) { + || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0 + || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) { // this allocates all Vulkan resources and memory buffers ctx->ctx_kompute = ggml_vk_init(); From 601905e75ee6cbacec0ee5aa523c96fb0258bd63 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Mon, 2 Oct 2023 09:00:55 -0400 Subject: [PATCH 19/93] Move the subgroups and printf into common. --- kompute/common.comp | 2 ++ kompute/op_mul_mv_q_n.comp | 9 +++------ kompute/op_softmax.comp | 2 -- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/kompute/common.comp b/kompute/common.comp index 12fc7d8b5..2e843a878 100644 --- a/kompute/common.comp +++ b/kompute/common.comp @@ -12,6 +12,8 @@ #extension GL_EXT_shader_explicit_arithmetic_types_int8: require #extension GL_EXT_shader_explicit_arithmetic_types_int16: require #extension GL_EXT_control_flow_attributes: enable +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_debug_printf : enable #define QK4_0 32 #define QR4_0 2 diff --git a/kompute/op_mul_mv_q_n.comp b/kompute/op_mul_mv_q_n.comp index 83de952dd..15bcbf765 100644 --- a/kompute/op_mul_mv_q_n.comp +++ b/kompute/op_mul_mv_q_n.comp @@ -6,9 +6,6 @@ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. */ -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_EXT_debug_printf : enable - void main() { const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT); const uint r0 = gl_WorkGroupID.x; @@ -27,9 +24,9 @@ void main() { uint yb = y + ix * BLOCKS_IN_QUANT + il; - debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n", - gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize, - gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z); + //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n", + // gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize, + // gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z); for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) { for (int row = 0; row < N_ROWS; row++) { diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp index 60456a3bb..d21577ac0 100644 --- a/kompute/op_softmax.comp +++ b/kompute/op_softmax.comp @@ -10,8 +10,6 @@ #include "common.comp" -#extension GL_KHR_shader_subgroup_arithmetic : require - layout(local_size_x_id = 0) in; layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; From 5509f743187f69624fc617faeefc82c175d33e57 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Mon, 2 Oct 2023 09:01:45 -0400 Subject: [PATCH 20/93] Minor cleanup. 
--- ggml-vulkan.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 74dd0f00f..f770a2d0c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -939,7 +939,6 @@ void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_siz s_algo->setPushConstants({pushConsts}); s_algo->updateDescriptors(s_kompute_context->pool.get()); } - seq.record({out}); seq.record(s_algo); } @@ -951,7 +950,6 @@ void ggml_vk_mul_mat_q4_0(Args&&... args) { ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward(args)...); } -// FIXME: This could be improved like was done in q4_0 version but needs testing... template void ggml_vk_mul_mat_q4_1(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv, From 4b223ec4329a24f3b932ea1a9c0456ef11b851ea Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Mon, 2 Oct 2023 09:04:02 -0400 Subject: [PATCH 21/93] Refactor getrows to use common code and get ready for q6_k. --- kompute/common.comp | 138 +++++++++++++++-------------------- kompute/op_getrows.comp | 25 +++++++ kompute/op_getrows_f16.comp | 10 ++- kompute/op_getrows_q4_0.comp | 38 +++------- kompute/op_getrows_q4_1.comp | 41 +++-------- 5 files changed, 111 insertions(+), 141 deletions(-) create mode 100644 kompute/op_getrows.comp diff --git a/kompute/common.comp b/kompute/common.comp index 2e843a878..040b87375 100644 --- a/kompute/common.comp +++ b/kompute/common.comp @@ -16,27 +16,12 @@ #extension GL_EXT_debug_printf : enable #define QK4_0 32 -#define QR4_0 2 #define QK4_1 32 #define GELU_COEF_A 0.044715 #define SQRT_2_OVER_PI 0.79788456080286535587989211986876 -#ifndef QK_K #define QK_K 256 -#endif - -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -#define BM 128 -#define BN 128 -#define BK 8 -#define TM 8 -#define TN 8 #define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) #define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) @@ -44,83 +29,76 @@ #define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) #define sizeof_block_q4_0 0x12 -#define sizeof_block_q4_1 0x14 struct block_q4_0 { float16_t d; uint8_t qs[QK4_0 / 2]; }; +mat4 dequantize_q4_0(const block_q4_0 xb, uint il) { + const float d1 = il != 0 ? (xb.d / 16.f) : xb.d; + const float d2 = d1 / 256.f; + const float md = -8.f * xb.d; + const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F); + const uint16_t mask1 = mask0 << 8; + + mat4 reg; + for (int i=0;i<8;i++) { + uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]); + reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md; + reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md; + } + return reg; +} + +#define sizeof_block_q4_1 0x14 struct block_q4_1 { float16_t d; float16_t m; uint8_t qs[QK4_1 / 2]; }; +mat4 dequantize_q4_1(const block_q4_1 xb, uint il) { + const float d1 = il != 0 ? (xb.d / 16.f) : xb.d; + const float d2 = d1 / 256.f; + const float m = xb.m; + const uint16_t mask0 = il != 0 ? 
uint16_t(0x00F0) : uint16_t(0x000F); + const uint16_t mask1 = mask0 << 8; -#ifndef QK_K -#define QK_K 256 -#endif + mat4 reg; + for (int i=0;i<8;i++) { + uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]); + reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m; + reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m; + } + return reg; +} -#if QK_K == 256 -#define K_SCALE_SIZE 12 -#else -#define K_SCALE_SIZE 4 -#endif - -struct block_q2_K { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins -}; -// 84 bytes / block - -struct block_q3_K { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits -#if QK_K == 64 - uint8_t scales[2]; -#else - uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits -#endif - float16_t d; // super-block scale -}; - -#if QK_K == 64 -typedef struct { - float16_t d[2]; // super-block scales/mins - uint8_t scales[2]; - uint8_t qs[QK_K/2]; // 4-bit quants -} block_q4_K; -#else -struct block_q4_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -}; -#endif - -#if QK_K == 64 -struct block_q5_K { - float16_t d; // super-block scales/mins - int8_t scales[QK_K/16]; // 8-bit block scales - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -#else -struct block_q5_K { - float16_t d; // super-block scale for quantized scales - float16_t dmin; // super-block scale for quantized mins - uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -}; -// 176 bytes / block -#endif - -struct block_q6_K { +#define sizeof_block_q6_k 210 +struct block_q6_k { uint8_t ql[QK_K/2]; // quants, lower 4 bits uint8_t qh[QK_K/4]; // quants, upper 2 bits int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale + float16_t d; // super-block scale }; -// 210 bytes / block +mat4 dequantize_q6_k(const block_q6_k xb, uint il) { + const float16_t d_all = xb.d; + uint8_t ql[QK_K/2]; + uint8_t qh[QK_K/4]; + int8_t scales[QK_K/16]; + + const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1); + const uint qhIndex = 32*(il/8) + 16*(il&1); + float16_t sc = xb.scales[(il%2) + 2 * ((il/2))]; + il = (il/2) & 3; + + const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3); + const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F); + const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f); + const float16_t ml = float16_t(d_all * sc * 32.f); + const float16_t dl = float16_t(d_all * sc * coef); + mat4 reg; + for (int i = 0; i < 16; ++i) { + const float16_t q = (il&1) != 0 ? ((ql[qlIndex + i] & kmask2) | ((qh[qhIndex + i] & kmask1) << 2)) + : ((ql[qlIndex + i] & kmask2) | ((qh[qhIndex + i] & kmask1) << 4)); + reg[i/4][i%4] = dl * q - ml; + } + return reg; +} diff --git a/kompute/op_getrows.comp b/kompute/op_getrows.comp new file mode 100644 index 000000000..a4d8bb9a0 --- /dev/null +++ b/kompute/op_getrows.comp @@ -0,0 +1,25 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. 
+ * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +void main() { + const uint i = gl_WorkGroupID.x; + const int r = inB[i + pcs.inBOff]; + + int z = 0; + for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) { + const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK; + const mat4 result = dequantize_block(inIndex, ind%NL); + for (uint j = 0; j < 4; ++j) { + for (uint k = 0; k < 4; ++k) { + const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z; + out_[outIndex] = result[j][k]; + ++z; + } + } + } +} diff --git a/kompute/op_getrows_f16.comp b/kompute/op_getrows_f16.comp index 17b478b5e..3f2b16724 100644 --- a/kompute/op_getrows_f16.comp +++ b/kompute/op_getrows_f16.comp @@ -25,11 +25,15 @@ layout (push_constant) uniform parameter { int nb1; } pcs; +void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { + for (int j = 0; j < k; j++) { + out_[y + j] = inA[x + j]; + } +} + void main() { const uint i = gl_WorkGroupID.x; const int r = inB[i + pcs.inBOff]; - for (int j = 0; j < pcs.ne00; j++) { - out_[i*pcs.nb1 + j + pcs.outOff] = inA[r*pcs.nb01/2+j + pcs.inAOff]; - } + dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1 + pcs.outOff, pcs.ne00); } diff --git a/kompute/op_getrows_q4_0.comp b/kompute/op_getrows_q4_0.comp index 590f218e6..0449b1987 100644 --- a/kompute/op_getrows_q4_0.comp +++ b/kompute/op_getrows_q4_0.comp @@ -10,6 +10,10 @@ #include "common.comp" +#define NL 2 +#define BYTES_FOR_TYPE 4 /*bytes for float*/ +#define SIZE_OF_BLOCK sizeof_block_q4_0 + layout(local_size_x = 1) in; layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; @@ -25,40 +29,18 @@ layout (push_constant) uniform parameter { int nb1; } pcs; -#define UNALIGNED_INPUT inA - block_q4_0 get_unaligned_block_q4_0(uint index) { block_q4_0 fres; - fres.d = u8BufToFloat16(UNALIGNED_INPUT, index); + fres.d = u8BufToFloat16(inA, index); [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) { - fres.qs[it] = UNALIGNED_INPUT[index+2+it]; + fres.qs[it] = inA[index+2+it]; } return fres; } -void dequantize_row_q4_0(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { - const uint qk = QK4_0; - - const uint nb = k / qk; - - for (uint i = 0; i < nb; i++) { - const block_q4_0 block = get_unaligned_block_q4_0(x + i*sizeof_block_q4_0); - - const float16_t d = block.d; - - for (uint j = 0; j < qk/2; ++j) { - const int x0 = (block.qs[j] & 0x0F) - 8; - const int x1 = (block.qs[j] >> 4) - 8; - - out_[y+i*qk + j + 0 ] = float(x0)*d; - out_[y+i*qk + j + qk/2] = float(x1)*d; - } - } +mat4 dequantize_block(uint index, uint il) { + const block_q4_0 block = get_unaligned_block_q4_0(index); + return dequantize_q4_0(block, il); } -void main() { - const uint i = gl_WorkGroupID.x; - const int r = inB[i + pcs.inBOff]; - - dequantize_row_q4_0(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00); -} +#include "op_getrows.comp" diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp index 3d00928d3..64586cdc9 100644 --- a/kompute/op_getrows_q4_1.comp +++ b/kompute/op_getrows_q4_1.comp @@ -10,6 +10,10 @@ #include "common.comp" +#define NL 2 +#define BYTES_FOR_TYPE 4 /*bytes for float*/ +#define SIZE_OF_BLOCK 
sizeof_block_q4_1 + layout(local_size_x = 1) in; layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; @@ -25,42 +29,19 @@ layout (push_constant) uniform parameter { int nb1; } pcs; -#define UNALIGNED_INPUT inA - block_q4_1 get_unaligned_block_q4_1(uint index) { block_q4_1 fres; - fres.d = u8BufToFloat16(UNALIGNED_INPUT, index); - fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2); + fres.d = u8BufToFloat16(inA, index); + fres.m = u8BufToFloat16(inA, index+2); [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) { - fres.qs[it] = UNALIGNED_INPUT[index+4+it]; + fres.qs[it] = inA[index+4+it]; } return fres; } -void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { - const uint qk = QK4_1; - - const uint nb = k / qk; - - for (uint i = 0; i < nb; i++) { - const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_1); - - const float16_t d = block.d; - const float16_t m = block.m; - - for (uint j = 0; j < qk/2; ++j) { - const int x0 = (block.qs[j] & 0x0F); - const int x1 = (block.qs[j] >> 4); - - out_[y+i*qk + j + 0 ] = float(x0)*d + m; - out_[y+i*qk + j + qk/2] = float(x1)*d + m; - } - } +mat4 dequantize_block(uint index, uint il) { + const block_q4_1 block = get_unaligned_block_q4_1(index); + return dequantize_q4_1(block, il); } -void main() { - const uint i = gl_WorkGroupID.x; - const int r = inB[i + pcs.inBOff]; - - dequantize_row_q4_1(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00); -} +#include "op_getrows.comp" From f1c9bc18216606b992a4b13b4154ddf97e443a92 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Mon, 2 Oct 2023 09:05:22 -0400 Subject: [PATCH 22/93] Add q6_k getrows and mul*vec kernel. --- ggml-vulkan.cpp | 47 ++++++++++++++ kompute/op_getrows_q6_k.comp | 52 ++++++++++++++++ kompute/op_mul_mat_q6_k.comp | 117 +++++++++++++++++++++++++++++++++++ llama.cpp | 3 +- 4 files changed, 218 insertions(+), 1 deletion(-) create mode 100644 kompute/op_getrows_q6_k.comp create mode 100644 kompute/op_mul_mat_q6_k.comp diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index f770a2d0c..1dd504127 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -25,9 +25,11 @@ #include "shaderop_mul_mat_f16.h" #include "shaderop_mul_mat_q4_0.h" #include "shaderop_mul_mat_q4_1.h" +#include "shaderop_mul_mat_q6_k.h" #include "shaderop_getrows_f16.h" #include "shaderop_getrows_q4_0.h" #include "shaderop_getrows_q4_1.h" +#include "shaderop_getrows_q6_k.h" #include "shaderop_rope.h" #include "shaderop_cpy_f16_f16.h" #include "shaderop_cpy_f16_f32.h" @@ -52,6 +54,7 @@ #define QK4_0 32 #define QR4_0 2 #define QK4_1 32 +#define QK_NL 16 typedef ggml_fp16_t half; struct ggml_kompute_context { @@ -958,6 +961,38 @@ void ggml_vk_mul_mat_q4_1(Args&&... 
args) { ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward(args)...); } +void ggml_vk_mul_mat_q6_k(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1, + int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv, + kp::shader_data::op_mul_mat_q6_k_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne10, ne0, ne1, ne01, gqa; + } pushConsts { + inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne10, ne0, ne1, ne01, ne12/ne02 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { +// const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {2,32}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + void ggml_vk_get_rows(const std::vector& spirv, unsigned element_size, unsigned qk, kp::Sequence& seq, @@ -1016,6 +1051,13 @@ void ggml_vk_get_rows_q4_1(Args&&... args) { ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK4_1, std::forward(args)...); } +template +void ggml_vk_get_rows_q6_k(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv, + kp::shader_data::op_getrows_q6_k_comp_spv_len); + ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward(args)...); +} + void ggml_vk_rope(kp::Sequence& seq, const std::shared_ptr& in, const std::shared_ptr& out, @@ -1297,6 +1339,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph case GGML_TYPE_Q4_1: ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); break; + case GGML_TYPE_Q6_K: + ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); + break; default: { fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); goto not_implemented; @@ -1312,6 +1357,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); } else if (src0t == GGML_TYPE_Q4_1) { ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); + } else if (src0t == GGML_TYPE_Q6_K) { + ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); } else { fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t); goto not_implemented; diff --git a/kompute/op_getrows_q6_k.comp b/kompute/op_getrows_q6_k.comp new file mode 100644 index 000000000..95817b487 --- /dev/null +++ b/kompute/op_getrows_q6_k.comp @@ -0,0 +1,52 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. 
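+ *
+ * Layout note (assuming QK_K == 256, which is the layout decoded by the
+ * accessors below and in op_mul_mat_q6_k.comp): a q6_K super-block is read
+ * here as raw bytes
+ *
+ *   bytes   0..127  ql[QK_K/2]       low 4 bits of the 6-bit quants
+ *   bytes 128..191  qh[QK_K/4]       high 2 bits of the 6-bit quants
+ *   bytes 192..207  scales[QK_K/16]  int8 sub-block scales
+ *   bytes 208..209  d                fp16 super-block scale
+ *
+ * for a total of sizeof_block_q6_k == 210 bytes, which is why indices of the
+ * form index + QK_K/2 + QK_K/4 + ... appear in get_unaligned_block_q6_k.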
+ * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#include "common.comp" + +#define NL 16 +#define BYTES_FOR_TYPE 4 /*bytes for float*/ +#define SIZE_OF_BLOCK sizeof_block_q6_k + +layout(local_size_x = 1) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { int inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int nb01; + int nb1; +} pcs; + +block_q6_k get_unaligned_block_q6_k(uint index) { + block_q6_k fres; + [[unroll]] for (uint it = 0; it != QK_K / 2; it++) { + fres.ql[it] = inA[index + it]; + } + [[unroll]] for (uint it = 0; it != QK_K / 4; it++) { + fres.qh[it] = inA[index + QK_K/2 + it]; + } + [[unroll]] for (uint it = 0; it != QK_K / 16; it++) { + fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]); + } + fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16); + return fres; +} + +mat4 dequantize_block(uint index, uint il) { + const block_q6_k block = get_unaligned_block_q6_k(index); + return dequantize_q6_k(block, il); +} + +#include "op_getrows.comp" diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute/op_mul_mat_q6_k.comp new file mode 100644 index 000000000..1e4ea37f8 --- /dev/null +++ b/kompute/op_mul_mat_q6_k.comp @@ -0,0 +1,117 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
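+ *
+ * Rough shape of the mat*vec kernel below, as dispatched from
+ * ggml_vk_mul_mat_q6_k: each workgroup covers two rows of inA, one row per
+ * subgroup; every invocation strides over that row's super-blocks, keeping
+ * four partial sums that are combined with the int8 scales and the fp16 d,
+ * and the per-row total is reduced with subgroupAdd() and written out by the
+ * elected invocation.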
+ */ + +#version 450 + +#include "common.comp" + +#define SIZE_OF_BLOCK sizeof_block_q6_k + +layout(local_size_x_id = 0) in; +layout(local_size_y_id = 1) in; +layout(local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne10; + int ne0; + int ne1; + int ne01; + int gqa; +} pcs; + +block_q6_k get_unaligned_block_q6_k(uint index) { + block_q6_k fres; + [[unroll]] for (uint it = 0; it != QK_K / 2; it++) { + fres.ql[it] = inA[index + it]; + } + [[unroll]] for (uint it = 0; it != QK_K / 4; it++) { + fres.qh[it] = inA[index + QK_K/2 + it]; + } + [[unroll]] for (uint it = 0; it != QK_K / 16; it++) { + fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]); + } + fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16); + return fres; +} + +void main() { + const uint8_t kmask1 = uint8_t(0x03); + const uint8_t kmask2 = uint8_t(0x0C); + const uint8_t kmask3 = uint8_t(0x30); + const uint8_t kmask4 = uint8_t(0xC0); + + const int nb = pcs.ne00/QK_K; + + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + const uint r2 = gl_WorkGroupID.z; + + const uint row = 2 * r0 + gl_SubgroupID; + const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0); + const uint x = row * nb + offset0; // Based from inA without base offset + const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB + + float sumf = 0; + + const uint tid = gl_SubgroupInvocationID/2; + const uint ix = gl_SubgroupInvocationID%2; + const uint ip = tid/8; // 0 or 1 + const uint il = tid%8; + const uint n = 4; + const uint l0 = n*il; + const uint is = 8*ip + l0/16; + + const uint y_offset = 128*ip + l0; + const uint q_offset_l = 64*ip + l0; + const uint q_offset_h = 32*ip + l0; + + for (uint i = ix; i < nb; i += 2) { + + const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff; +// const uint index = (x + i) * SIZE_OF_BLOCK + pcs.inAOff; +// const block_q6_k block = get_unaligned_block_q6_k(index); + + const uint qlIndex = q_offset_l; + const uint q2Index = qlIndex + 32; + const uint qhIndex = q_offset_h; + const uint y = yy + i * QK_K + y_offset; + + float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + for (uint l = 0; l < n; ++l) { + +// const uint8_t currentQ1 = block.ql[qlIndex + l]; +// const uint8_t currentQ2 = block.ql[q2Index + l]; +// const uint8_t currentQh = block.qh[qhIndex + l]; + const uint8_t currentQ1 = inA[baseIndex + qlIndex + l]; + const uint8_t currentQ2 = inA[baseIndex + q2Index + l]; + const uint8_t currentQh = inA[baseIndex + qhIndex + l]; + + sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32); + sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32); + sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32); + sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32); + } + +// sumf += block.d * (sums[0] * block.scales[0+is] + sums[1] * block.scales[2+is] + sums[2] * block.scales[4+is] + sums[3] * block.scales[6+is]); + float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16); + sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + 
sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is])); + } + + const float tot = subgroupAdd(sumf); + if (subgroupElect()) { + out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot; + } +} diff --git a/llama.cpp b/llama.cpp index 245174898..603f7cc64 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6496,7 +6496,8 @@ struct llama_context * llama_new_context_with_model( && (model->ftype == LLAMA_FTYPE_ALL_F32 || model->ftype == LLAMA_FTYPE_MOSTLY_F16 || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0 - || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) { + || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1 + || model->ftype == LLAMA_FTYPE_MOSTLY_Q6_K)) { // this allocates all Vulkan resources and memory buffers ctx->ctx_kompute = ggml_vk_init(); From 06d4b21598da0162999b35429cfb567ed962d7ec Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Mon, 2 Oct 2023 11:30:10 -0400 Subject: [PATCH 23/93] Fix offset into the qh and now we have working vulkan accelerated for gguff'd llama. --- kompute/op_mul_mat_q6_k.comp | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute/op_mul_mat_q6_k.comp index 1e4ea37f8..c7b9aa753 100644 --- a/kompute/op_mul_mat_q6_k.comp +++ b/kompute/op_mul_mat_q6_k.comp @@ -32,28 +32,13 @@ layout (push_constant) uniform parameter { int gqa; } pcs; -block_q6_k get_unaligned_block_q6_k(uint index) { - block_q6_k fres; - [[unroll]] for (uint it = 0; it != QK_K / 2; it++) { - fres.ql[it] = inA[index + it]; - } - [[unroll]] for (uint it = 0; it != QK_K / 4; it++) { - fres.qh[it] = inA[index + QK_K/2 + it]; - } - [[unroll]] for (uint it = 0; it != QK_K / 16; it++) { - fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]); - } - fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16); - return fres; -} - void main() { const uint8_t kmask1 = uint8_t(0x03); const uint8_t kmask2 = uint8_t(0x0C); const uint8_t kmask3 = uint8_t(0x30); const uint8_t kmask4 = uint8_t(0xC0); - const int nb = pcs.ne00/QK_K; + const uint nb = pcs.ne00/QK_K; const uint r0 = gl_WorkGroupID.x; const uint r1 = gl_WorkGroupID.y; @@ -81,8 +66,6 @@ void main() { for (uint i = ix; i < nb; i += 2) { const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff; -// const uint index = (x + i) * SIZE_OF_BLOCK + pcs.inAOff; -// const block_q6_k block = get_unaligned_block_q6_k(index); const uint qlIndex = q_offset_l; const uint q2Index = qlIndex + 32; @@ -91,13 +74,9 @@ void main() { float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f}; for (uint l = 0; l < n; ++l) { - -// const uint8_t currentQ1 = block.ql[qlIndex + l]; -// const uint8_t currentQ2 = block.ql[q2Index + l]; -// const uint8_t currentQh = block.qh[qhIndex + l]; const uint8_t currentQ1 = inA[baseIndex + qlIndex + l]; const uint8_t currentQ2 = inA[baseIndex + q2Index + l]; - const uint8_t currentQh = inA[baseIndex + qhIndex + l]; + const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l]; sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32); sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32); @@ -105,7 +84,6 @@ void main() { sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32); } -// sumf += block.d * (sums[0] * block.scales[0+is] + sums[1] * block.scales[2+is] + sums[2] * block.scales[4+is] + sums[3] * block.scales[6+is]); float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16); sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * 
int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is])); } From 32289aa447344fa8a5a8d9f6289af41fb15fd910 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Mon, 2 Oct 2023 21:00:48 -0400 Subject: [PATCH 24/93] Fixes for norm. --- kompute/op_norm.comp | 2 +- kompute/op_rmsnorm.comp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp index 4b2db25e3..5aafeaac5 100644 --- a/kompute/op_norm.comp +++ b/kompute/op_norm.comp @@ -56,7 +56,7 @@ void main() { const float mean = sum[0]; // recenter - const uint y = (gl_WorkGroupID.x*pcs.ne00/4) + pcs.outOff; // Based from out_ + const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_ for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { out_[y+i00] = in_[x+i00] - mean; } diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp index dd2c5cdde..8d6c0fa6a 100644 --- a/kompute/op_rmsnorm.comp +++ b/kompute/op_rmsnorm.comp @@ -10,7 +10,7 @@ #include "common.comp" -#define nth 256 +#define nth 512 layout(local_size_x = nth) in; @@ -56,7 +56,7 @@ void main() { const float scale = 1.0f/sqrt(sum[0] + pcs.eps); - const uint y = (gl_WorkGroupID.x*pcs.ne00/4) + pcs.outOff; // Based from out_ + const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_ for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { out_[y+i00] = in_[x+i00] * scale; } From 6ac39752bf8f1e3596386238fd3d0e68aaf2dfd5 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Tue, 3 Oct 2023 12:40:24 -0400 Subject: [PATCH 25/93] Fixup the upstream CMakelists.txt so we can build just llama.cpp with our branch. --- CMakeLists.txt | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 31532df91..2445d177c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -414,6 +414,7 @@ if (LLAMA_HIPBLAS) endif() if (LLAMA_KOMPUTE) + add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) find_package(Vulkan COMPONENTS glslc REQUIRED) find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) if (NOT glslc_executable) @@ -429,8 +430,11 @@ if (LLAMA_KOMPUTE) set(spv_file ${source}.spv) add_custom_command( OUTPUT ${spv_file} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} ${CMAKE_CURRENT_SOURCE_DIR}/kompute/common.comp - COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} + ${CMAKE_CURRENT_SOURCE_DIR}/kompute/common.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute/op_getrows.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute/op_mul_mv_q_n.comp + COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} COMMENT "Compiling ${source} to ${source}.spv" ) @@ -478,9 +482,11 @@ if (LLAMA_KOMPUTE) kompute/op_mul_mat_f16.comp kompute/op_mul_mat_q4_0.comp kompute/op_mul_mat_q4_1.comp + kompute/op_mul_mat_q6_k.comp kompute/op_getrows_f16.comp kompute/op_getrows_q4_0.comp kompute/op_getrows_q4_1.comp + kompute/op_getrows_q6_k.comp kompute/op_rope.comp kompute/op_cpy_f16_f16.comp kompute/op_cpy_f16_f32.comp @@ -505,9 +511,11 @@ if (LLAMA_KOMPUTE) shaderop_mul_mat_f16.h shaderop_mul_mat_q4_0.h shaderop_mul_mat_q4_1.h + shaderop_mul_mat_q6_k.h shaderop_getrows_f16.h shaderop_getrows_q4_0.h shaderop_getrows_q4_1.h + shaderop_getrows_q6_k.h shaderop_rope.h shaderop_cpy_f16_f16.h 
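    # Each kompute/op_*.comp listed above is compiled by glslc
    # (--target-env=vulkan1.2) into SPIR-V via the add_custom_command above
    # and made available as a generated shaderop_*.h header exposing
    # kp::shader_data::op_*_comp_spv/_len, which ggml-vulkan.cpp loads through
    # getSpirvShader(); the two lists therefore have to stay in sync whenever
    # a kernel such as op_getrows_q6_k.comp is added.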
shaderop_cpy_f16_f32.h From de589ced7cea1e9d5a352668e905986a92efc866 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Tue, 3 Oct 2023 13:30:23 -0400 Subject: [PATCH 26/93] Change this back to be in agreement with metal and our previous softmax kernel. --- ggml-vulkan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 1dd504127..2326f56b5 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -785,7 +785,7 @@ void ggml_vk_soft_max(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) { - const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; + const uint32_t local_x = ggml_vk_current_device().subgroupSize; s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(__func__); From bc4b5ed1cb2ea9bdf71c0ea4356bfcc7f4a988b3 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Wed, 4 Oct 2023 14:24:35 -0400 Subject: [PATCH 27/93] Fixes for subgroup size to bring AMD and NVIDIA inline with eachother for all kernels. --- ggml-vulkan.cpp | 7 ++++--- kompute/op_mul_mat_q6_k.comp | 27 +++++++++++++++++---------- kompute/op_mul_mv_q_n.comp | 7 +++++-- kompute/op_softmax.comp | 9 ++++++--- 4 files changed, 32 insertions(+), 18 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 2326f56b5..86794e886 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -785,7 +785,8 @@ void ggml_vk_soft_max(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) { - const uint32_t local_x = ggml_vk_current_device().subgroupSize; + // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device + const uint32_t local_x = 32; s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(__func__); @@ -981,8 +982,8 @@ void ggml_vk_mul_mat_q6_k(kp::Sequence& seq, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) { -// const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {2,32}, {pushConsts}); + const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute/op_mul_mat_q6_k.comp index c7b9aa753..6148053b2 100644 --- a/kompute/op_mul_mat_q6_k.comp +++ b/kompute/op_mul_mat_q6_k.comp @@ -44,31 +44,38 @@ void main() { const uint r1 = gl_WorkGroupID.y; const uint r2 = gl_WorkGroupID.z; - const uint row = 2 * r0 + gl_SubgroupID; + const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID); const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0); const uint x = row * nb + offset0; // Based from inA without base offset const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB float sumf = 0; - const uint tid = gl_SubgroupInvocationID/2; - const uint ix = 
gl_SubgroupInvocationID%2; - const uint ip = tid/8; // 0 or 1 - const uint il = tid%8; - const uint n = 4; - const uint l0 = n*il; - const uint is = 8*ip + l0/16; + // bits of invocation ID for gl_SubgroupSize=32: + // x x x x x + // 4 3 2 1 0 + // ( tid ) ix + // ip ( il ) + + const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes + const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0 + const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1 + const uint ip = tid/8; // first or second half of block (0 or 1) + const uint il = tid%8; // each half has 8 parts, one per scale + const uint n = 4; // 4 scales at a time (and 4 sums) + const uint l0 = n*il; // offset into half-block, 0..28 + const uint is = 8*ip + l0/16; // 0, 1, 8, 9 const uint y_offset = 128*ip + l0; const uint q_offset_l = 64*ip + l0; const uint q_offset_h = 32*ip + l0; - for (uint i = ix; i < nb; i += 2) { + for (uint i = ix; i < nb; i += block_stride) { const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff; const uint qlIndex = q_offset_l; - const uint q2Index = qlIndex + 32; + const uint q2Index = qlIndex + QK_K/8; const uint qhIndex = q_offset_h; const uint y = yy + i * QK_K + y_offset; diff --git a/kompute/op_mul_mv_q_n.comp b/kompute/op_mul_mv_q_n.comp index 15bcbf765..a9b64fe16 100644 --- a/kompute/op_mul_mv_q_n.comp +++ b/kompute/op_mul_mv_q_n.comp @@ -7,6 +7,9 @@ */ void main() { + if (gl_SubgroupInvocationID > 31) + return; + const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT); const uint r0 = gl_WorkGroupID.x; const uint r1 = gl_WorkGroupID.y; @@ -28,13 +31,13 @@ void main() { // gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize, // gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z); - for (uint ib = ix; ib < nb; ib += gl_SubgroupSize/2) { + for (uint ib = ix; ib < nb; ib += 16) { for (int row = 0; row < N_ROWS; row++) { const uint block_index = x + ib + row * nb; sumf[row] += block_q_n_dot_y(block_index, yb, il); } - yb += BLOCKS_IN_QUANT * gl_SubgroupSize/2; + yb += BLOCKS_IN_QUANT * 16; } for (int row = 0; row < N_ROWS; ++row) { diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp index d21577ac0..30b6f0260 100644 --- a/kompute/op_softmax.comp +++ b/kompute/op_softmax.comp @@ -24,6 +24,9 @@ layout(push_constant) uniform PushConstants { } pcs; void main() { + if (gl_SubgroupInvocationID > 31) + return; + const uint i03 = gl_WorkGroupID.z; const uint i02 = gl_WorkGroupID.y; const uint i01 = gl_WorkGroupID.x; @@ -34,21 +37,21 @@ void main() { // parallel max float localMax = uintBitsToFloat(0xFF800000); - for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) { + for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { localMax = max(localMax, in_[psrc0 + i00]); } float max_ = subgroupMax(localMax); // parallel sum float localSum = 0.0f; - for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) { + for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { const float exp_psrc0 = exp(in_[psrc0 + i00] - max_); localSum += exp_psrc0; out_[pdst + i00] = exp_psrc0; } const float sum = subgroupAdd(localSum); - for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += gl_SubgroupSize) { + for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { out_[pdst + i00] /= sum; } } From 24a4a5956af130148d6cee6bdb5397bf3e5ce824 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: 
Wed, 4 Oct 2023 16:16:04 -0400 Subject: [PATCH 28/93] kompute : only try to use Vulkan for LLaMA itself --- llama.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama.cpp b/llama.cpp index 603f7cc64..6e7a53407 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6492,7 +6492,9 @@ struct llama_context * llama_new_context_with_model( #undef LLAMA_METAL_CHECK_BUF } #elif defined(GGML_USE_KOMPUTE) + // TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported if (ggml_vk_has_device() && params.n_gpu_layers > 0 + && model->arch == LLM_ARCH_LLAMA && (model->ftype == LLAMA_FTYPE_ALL_F32 || model->ftype == LLAMA_FTYPE_MOSTLY_F16 || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0 From 3d850db7671a48dd290ea543859f3b594dc4e0a0 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Wed, 4 Oct 2023 16:19:19 -0400 Subject: [PATCH 29/93] kompute : remove Q6_K from list of supported quant types --- llama.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 6e7a53407..e79251194 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6498,8 +6498,7 @@ struct llama_context * llama_new_context_with_model( && (model->ftype == LLAMA_FTYPE_ALL_F32 || model->ftype == LLAMA_FTYPE_MOSTLY_F16 || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0 - || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1 - || model->ftype == LLAMA_FTYPE_MOSTLY_Q6_K)) { + || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) { // this allocates all Vulkan resources and memory buffers ctx->ctx_kompute = ggml_vk_init(); From 9db90cbe1215b7850c1b3cbc10508931f55a3141 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Wed, 4 Oct 2023 21:49:55 -0700 Subject: [PATCH 30/93] f16 mv broadcasting fix (gqa fix) --- ggml-vulkan.cpp | 11 ++++++----- kompute/op_mul_mat_f16.comp | 9 +++++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 86794e886..bf732be32 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -884,7 +884,7 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, const std::shared_ptr& inB, const std::shared_ptr& out, uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, + int32_t ne00, int32_t ne01, int32_t ne02, uint32_t nb01, uint32_t nb02, int32_t ne11, int32_t ne12, uint32_t nb11, uint32_t nb12, @@ -897,20 +897,21 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, int32_t ne00; uint32_t nb01, nb02; uint32_t nb11, nb12; + int32_t ne02, ne12; int32_t ne0, ne1; } pushConsts { safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, nb01, nb02, nb11, nb12, ne0, ne1, + ne00, nb01, nb02, nb11, nb12, ne02, ne12, ne0, ne1, }; std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) { const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02))}, {local_x}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02))}); s_algo->setPushConstants({pushConsts}); s_algo->updateDescriptors(s_kompute_context->pool.get()); } @@ -1332,7 +1333,7 @@ 
void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph switch (src0t) { case GGML_TYPE_F16: case GGML_TYPE_F32: - ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); + ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); break; case GGML_TYPE_Q4_0: ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp index 72a667f92..b56d14f77 100644 --- a/kompute/op_mul_mat_f16.comp +++ b/kompute/op_mul_mat_f16.comp @@ -27,6 +27,8 @@ layout (push_constant) uniform parameter { uint nb02; uint nb11; uint nb12; + uint ne02; + uint ne12; int ne0; int ne1; } pcs; @@ -36,8 +38,11 @@ void main() { const uint r1 = gl_WorkGroupID.y; const uint im = gl_WorkGroupID.z; - const uint x = (r0*pcs.nb01 + im*pcs.nb02) / 2 + pcs.inAOff; // Based from inA - const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // based from inB + uint bc_ab = pcs.ne12 > pcs.ne02 ? im / (pcs.ne12 / pcs.ne02) : im; + uint bc_ba = pcs.ne02 > pcs.ne12 ? im / (pcs.ne02 / pcs.ne12) : im; + + const uint x = (r0*pcs.nb01 + bc_ab*pcs.nb02) / 2 + pcs.inAOff; // Based from inA + const uint y = (r1*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB float sumf = 0.0f; for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { From ff4212d20fcbc675106efb19c5278af60e18e97d Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Wed, 4 Oct 2023 21:02:17 -0700 Subject: [PATCH 31/93] q8 mat*vec --- CMakeLists.txt | 2 ++ ggml-vulkan.cpp | 41 +++++++++++++++++++++++ kompute/op_mul_mat_q8_0.comp | 64 ++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+) create mode 100644 kompute/op_mul_mat_q8_0.comp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2445d177c..c0538eb88 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -480,6 +480,7 @@ if (LLAMA_KOMPUTE) kompute/op_rmsnorm.comp kompute/op_diagmask.comp kompute/op_mul_mat_f16.comp + kompute/op_mul_mat_q8_0.comp kompute/op_mul_mat_q4_0.comp kompute/op_mul_mat_q4_1.comp kompute/op_mul_mat_q6_k.comp @@ -509,6 +510,7 @@ if (LLAMA_KOMPUTE) shaderop_rmsnorm.h shaderop_diagmask.h shaderop_mul_mat_f16.h + shaderop_mul_mat_q8_0.h shaderop_mul_mat_q4_0.h shaderop_mul_mat_q4_1.h shaderop_mul_mat_q6_k.h diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index bf732be32..59852c649 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -23,6 +23,7 @@ #include "shaderop_rmsnorm.h" #include "shaderop_diagmask.h" #include "shaderop_mul_mat_f16.h" +#include "shaderop_mul_mat_q8_0.h" #include "shaderop_mul_mat_q4_0.h" #include "shaderop_mul_mat_q4_1.h" #include "shaderop_mul_mat_q6_k.h" @@ -918,6 +919,43 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, seq.record(s_algo); } +void ggml_vk_mul_mat_q8_0(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, + uint32_t nb01, uint32_t nb02, + int32_t ne11, int32_t ne12, + uint32_t nb11, uint32_t nb12, + int32_t ne0, int32_t ne1) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv, + kp::shader_data::op_mul_mat_q8_0_comp_spv_len); + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00; + uint32_t nb01, nb02; 
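+        // Pushed as the shader's push-constant data (see the push_constant
+        // block in op_mul_mat_q8_0.comp); the offsets are run through
+        // safe_divide() below so they are expressed in buffer elements
+        // rather than bytes where the shader expects that.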
+ uint32_t nb11, nb12; + int32_t ne0, ne1; + } pushConsts { + safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, nb01, nb02, nb11, nb12, ne0, ne1, + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + const uint32_t local_x = ggml_vk_current_device().subgroupSize; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(ne12)}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_size, kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, @@ -1335,6 +1373,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph case GGML_TYPE_F32: ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); break; + case GGML_TYPE_Q8_0: + ggml_vk_mul_mat_q8_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); + break; case GGML_TYPE_Q4_0: ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); break; diff --git a/kompute/op_mul_mat_q8_0.comp b/kompute/op_mul_mat_q8_0.comp new file mode 100644 index 000000000..2ba48127b --- /dev/null +++ b/kompute/op_mul_mat_q8_0.comp @@ -0,0 +1,64 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
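+ *
+ * Data layout assumed below: a q8_0 block is BLOCK_SIZE = 34 bytes, an fp16
+ * scale d followed by ELS_PER_BLOCK = 32 int8 quants; each weight is
+ * recovered as d * q before being multiplied with the corresponding float
+ * from inB.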
+ */ + +#version 450 + +#include "common.comp" + +#define BLOCKS_IN_QUANT QK8_0 +#define SIZE_OF_BLOCK sizeof_block_q8_0 +#define N_ROWS 4 + +layout(local_size_x_id = 0) in; +layout(local_size_y = 1) in; +layout(local_size_z = 1) in; + +layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout (binding = 1) readonly buffer tensorInB { float inB[]; }; +layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne10; + int ne0; + int ne1; + int ne01; + int gqa; +} pcs; + +#define ELS_PER_BLOCK 32 +#define SIZE_OF_D 2 +#define BLOCK_SIZE (ELS_PER_BLOCK + SIZE_OF_D) + +void main() { + const uint r0 = gl_WorkGroupID.x; + const uint r1 = gl_WorkGroupID.y; + const uint im = gl_WorkGroupID.z; + + const uint x = r0 * (pcs.ne00/ELS_PER_BLOCK) * BLOCK_SIZE + pcs.inAOff; // Based from inA + const uint y = r1 * pcs.ne10 + pcs.inBOff; // based from inB + + float sumf = 0.0f; + for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { + const uint block_number = i / ELS_PER_BLOCK; + const uint block_offset = block_number * BLOCK_SIZE; + const float d = u8BufToFloat16(inA, x + block_offset); + const uint position_in_block = i % ELS_PER_BLOCK; + const int q = int8_t(inA[x+block_offset+SIZE_OF_D+position_in_block]); + const float dq = d * q; + sumf += dq * float(inB[y+i]); + } + + const float all_sum = subgroupAdd(sumf); + if (subgroupElect()) { + out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum; + } +} From 020b1745a02e255fb059b575e0ca63248c84dd31 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Wed, 4 Oct 2023 23:36:24 -0700 Subject: [PATCH 32/93] vulkan: implement neox mode for rope --- kompute/op_rope.comp | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/kompute/op_rope.comp b/kompute/op_rope.comp index 3fa84f579..8c2854636 100644 --- a/kompute/op_rope.comp +++ b/kompute/op_rope.comp @@ -63,6 +63,25 @@ void main() { out_[dst_data+1] = x0*sin_theta + x1*cos_theta; } } else { - // TODO: implement + const float inv_ndims = -1.f/pcs.n_dims; + for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) { + for (uint ic = 0; ic < pcs.n_dims; ic += 2) { + const float cos_theta = cos(theta); + const float sin_theta = sin(theta); + + theta *= theta_scale; + + const uint i0 = ib*pcs.n_dims + ic/2; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + + const float x0 = in_[src]; + const float x1 = in_[src+pcs.n_dims/2]; + + out_[dst_data] = x0*cos_theta - x1*sin_theta; + out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta; + } + } } } From 8564f79036c724615f1677138d5e6ed5f61075ae Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Wed, 4 Oct 2023 21:03:27 -0700 Subject: [PATCH 33/93] falcon h2d + reenable vulkan --- llama.cpp | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index e79251194..858494244 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3210,6 +3210,9 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * cur; struct ggml_tensor * inpL; +#if defined(GGML_USE_KOMPUTE) + struct ggml_tensor * toDeviceTensor = nullptr; +#endif if (tokens) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); @@ -3219,7 +3222,9 @@ static struct 
ggml_cgraph * llm_build_falcon( memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens)); } ggml_set_name(inp_tokens, "inp_tokens"); - +#if defined(GGML_USE_KOMPUTE) + toDeviceTensor = inp_tokens; +#endif inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { #ifdef GGML_USE_MPI @@ -3232,6 +3237,9 @@ static struct ggml_cgraph * llm_build_falcon( if (!ggml_allocr_is_measure(lctx.alloc)) { memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL)); } +#if defined(GGML_USE_KOMPUTE) + toDeviceTensor = inpL; +#endif } const int i_gpu_start = n_layer - n_gpu_layers; @@ -3463,6 +3471,16 @@ static struct ggml_cgraph * llm_build_falcon( ggml_build_forward_expand(gf, cur); ggml_free(ctx0); + +#if defined(GGML_USE_KOMPUTE) + if (lctx.ctx_kompute) { + if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) { + ggml_vk_h2d_all(lctx.ctx_kompute); + } else { + ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor); + } + } +#endif return gf; } @@ -6494,7 +6512,7 @@ struct llama_context * llama_new_context_with_model( #elif defined(GGML_USE_KOMPUTE) // TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported if (ggml_vk_has_device() && params.n_gpu_layers > 0 - && model->arch == LLM_ARCH_LLAMA + && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON) && (model->ftype == LLAMA_FTYPE_ALL_F32 || model->ftype == LLAMA_FTYPE_MOSTLY_F16 || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0 From 09d83f04013f9e8551c3ff54449cf28e1ca00784 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Thu, 5 Oct 2023 10:52:04 -0400 Subject: [PATCH 34/93] Delete TODO now that we have q8_0. --- llama.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 858494244..f5e0eac81 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6510,7 +6510,6 @@ struct llama_context * llama_new_context_with_model( #undef LLAMA_METAL_CHECK_BUF } #elif defined(GGML_USE_KOMPUTE) - // TODO(cebtenzzre): we need to check the type of each tensor because Q8_0 is not currently supported if (ggml_vk_has_device() && params.n_gpu_layers > 0 && (model->arch == LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON) && (model->ftype == LLAMA_FTYPE_ALL_F32 From f0cd38b9adfa2105c2a19c4fd02edf71e1d1135a Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Tue, 10 Oct 2023 21:37:07 -0700 Subject: [PATCH 35/93] add mat*mat ops --- CMakeLists.txt | 8 + ggml-vulkan.cpp | 266 ++++++++++++++++++++++++++++++- kompute/op_mul_mat_mat_f16.comp | 56 +++++++ kompute/op_mul_mat_mat_f32.comp | 53 ++++++ kompute/op_mul_mat_mat_q4_0.comp | 77 +++++++++ kompute/op_mul_mat_mat_q8_0.comp | 66 ++++++++ llama.cpp | 2 +- 7 files changed, 521 insertions(+), 7 deletions(-) create mode 100644 kompute/op_mul_mat_mat_f16.comp create mode 100644 kompute/op_mul_mat_mat_f32.comp create mode 100644 kompute/op_mul_mat_mat_q4_0.comp create mode 100644 kompute/op_mul_mat_mat_q8_0.comp diff --git a/CMakeLists.txt b/CMakeLists.txt index c0538eb88..cf4042ea3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -479,6 +479,10 @@ if (LLAMA_KOMPUTE) kompute/op_norm.comp kompute/op_rmsnorm.comp kompute/op_diagmask.comp + kompute/op_mul_mat_mat_f16.comp + kompute/op_mul_mat_mat_f32.comp + kompute/op_mul_mat_mat_q4_0.comp + kompute/op_mul_mat_mat_q8_0.comp kompute/op_mul_mat_f16.comp kompute/op_mul_mat_q8_0.comp kompute/op_mul_mat_q4_0.comp @@ -509,6 +513,10 @@ if (LLAMA_KOMPUTE) shaderop_norm.h shaderop_rmsnorm.h shaderop_diagmask.h + shaderop_mul_mat_mat_f16.h + shaderop_mul_mat_mat_f32.h + shaderop_mul_mat_mat_q4_0.h + 
shaderop_mul_mat_mat_q8_0.h shaderop_mul_mat_f16.h shaderop_mul_mat_q8_0.h shaderop_mul_mat_q4_0.h diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 59852c649..6ae1a8fc3 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -27,6 +27,10 @@ #include "shaderop_mul_mat_q4_0.h" #include "shaderop_mul_mat_q4_1.h" #include "shaderop_mul_mat_q6_k.h" +#include "shaderop_mul_mat_mat_f32.h" +#include "shaderop_mul_mat_mat_f16.h" +#include "shaderop_mul_mat_mat_q4_0.h" +#include "shaderop_mul_mat_mat_q8_0.h" #include "shaderop_getrows_f16.h" #include "shaderop_getrows_q4_0.h" #include "shaderop_getrows_q4_1.h" @@ -938,7 +942,7 @@ void ggml_vk_mul_mat_q8_0(kp::Sequence& seq, uint32_t nb11, nb12; int32_t ne0, ne1; } pushConsts { - safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), + inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), ne00, nb01, nb02, nb11, nb12, ne0, ne1, }; @@ -956,6 +960,211 @@ void ggml_vk_mul_mat_q8_0(kp::Sequence& seq, seq.record(s_algo); } + +void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, + uint32_t nb01, uint32_t nb02, + int32_t ne11, int32_t ne12, + uint32_t nb11, uint32_t nb12, + uint32_t nb1, uint32_t nb2) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f32_comp_spv, + kp::shader_data::op_mul_mat_mat_f32_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne01, ne02, ne11, ne12; + uint32_t nb01, nb02; + uint32_t nb11, nb12; + uint32_t nb1, nb2; + } pushConsts { + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne01, ne02, ne11, ne12, + nb01, nb02, nb11, nb12, + nb1, nb2 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + //std::cerr << "init f32 matmat shader" << std::endl; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), + {inA, inB, out}, spirv, + {unsigned(ne01), + unsigned(ne11), + unsigned(ne12)}, + {}, + {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), + unsigned(ne11), + unsigned(std::max(ne12, ne02))}); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + //seq.record({out}); + seq.record(s_algo); +} + +void ggml_vk_mul_mat_mat_f16(kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, + uint32_t nb01, uint32_t nb02, + int32_t ne11, int32_t ne12, + uint32_t nb11, uint32_t nb12, + uint32_t nb1, uint32_t nb2) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f16_comp_spv, + kp::shader_data::op_mul_mat_mat_f16_comp_spv_len); + + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne01, ne02, ne11, ne12; + uint32_t nb01, nb02; + uint32_t nb11, nb12; + uint32_t nb1, nb2; + } pushConsts { + safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne01, ne02, ne11, ne12, + nb01, nb02, nb11, nb12, + nb1, nb2 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), + {inA, inB, out}, spirv, + 
{unsigned(ne01), + unsigned(ne11), + unsigned(std::max(ne12, ne02)) + }, + {}, + {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), + unsigned(ne11), + unsigned(std::max(ne12, ne02)), + }); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + + +void ggml_vk_mul_mat_mat_q8_0( + kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, + uint32_t nb01, uint32_t nb02, + int32_t ne11, int32_t ne12, + uint32_t nb11, uint32_t nb12, + uint32_t nb1, uint32_t nb2) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q8_0_comp_spv, + kp::shader_data::op_mul_mat_mat_q8_0_comp_spv_len); + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne01, ne02, ne11, ne12; + uint32_t nb01, nb02; + uint32_t nb11, nb12; + uint32_t nb1, nb2; + } pushConsts { + inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne01, ne02, ne11, ne12, + nb01, nb02, nb11, nb12, + nb1, nb2 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), + {inA, inB, out}, spirv, + {unsigned(ne01), + unsigned(ne11), + unsigned(std::max(ne12, ne02)) + }, + {}, + {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), + unsigned(ne11), + unsigned(std::max(ne12, ne02)), + }); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + + +void ggml_vk_mul_mat_mat_q4_x(const std::vector& spirv, + kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, + uint32_t nb01, uint32_t nb02, + int32_t ne11, int32_t ne12, + uint32_t nb11, uint32_t nb12, + uint32_t nb1, uint32_t nb2) { + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne01, ne02, ne11, ne12; + uint32_t nb01, nb02; + uint32_t nb11, nb12; + uint32_t nb1, nb2; + } pushConsts { + inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne01, ne02, ne11, ne12, + nb01, nb02, nb11, nb12, + nb1, nb2 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), + {inA, inB, out}, spirv, + {unsigned(ne01), + unsigned(ne11), + unsigned(std::max(ne12, ne02))}, + {}, + {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01), + unsigned(ne11), + unsigned(std::max(ne12, ne02)), + }); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} + + +template +void ggml_vk_mul_mat_mat_q4_0(Args&&... 
args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q4_0_comp_spv, + kp::shader_data::op_mul_mat_mat_q4_0_comp_spv_len); + + ggml_vk_mul_mat_mat_q4_x(spirv, std::forward(args)...); +} + void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_size, kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, @@ -1357,16 +1566,61 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph case GGML_OP_MUL_MAT: { if (src1t != GGML_TYPE_F32) { - fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); + fprintf(stderr, "%s: %s: Unsupported src1 type: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); goto not_implemented; } if (!ggml_is_transposed(src0) && !ggml_is_transposed(src1) - && ne00%32 == 0 - && ne11 > 1) { - fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); - goto not_implemented; + //&& ne00%32 == 0 + && ne11 > 1 + ) { + switch (src0t) { + case GGML_TYPE_F32: + ggml_vk_mul_mat_mat_f32(seq, + id_src0, id_src1, id_dst, + off_src0, off_src1, off_dst, + ne00, ne01, ne02, + nb01, nb02, + ne11, ne12, + nb11, nb12, + nb1, nb2); + break; + case GGML_TYPE_F16: + ggml_vk_mul_mat_mat_f16(seq, + id_src0, id_src1, id_dst, + off_src0, off_src1, off_dst, + ne00, ne01, ne02, + nb01, nb02, + ne11, ne12, + nb11, nb12, + nb1, nb2); + break; + case GGML_TYPE_Q4_0: + ggml_vk_mul_mat_mat_q4_0(seq, + id_src0, id_src1, id_dst, + off_src0, off_src1, off_dst, + ne00, ne01, ne02, + nb01, nb02, + ne11, ne12, + nb11, nb12, + nb1, nb2); + break; + case GGML_TYPE_Q8_0: + ggml_vk_mul_mat_mat_q8_0(seq, + id_src0, id_src1, id_dst, + off_src0, off_src1, off_dst, + ne00, ne01, ne02, + nb01, nb02, + ne11, ne12, + nb11, nb12, + nb1, nb2); + break; + default: { + fprintf(stderr, "%s: %s: Unsupported quantization for M*M: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); + goto not_implemented; + } + } } else { switch (src0t) { case GGML_TYPE_F16: diff --git a/kompute/op_mul_mat_mat_f16.comp b/kompute/op_mul_mat_mat_f16.comp new file mode 100644 index 000000000..b62f06d10 --- /dev/null +++ b/kompute/op_mul_mat_mat_f16.comp @@ -0,0 +1,56 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models + * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy + * of this license should accompany this software. Except as expressly granted + * in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#include "common.comp" + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_debug_printf : enable + +// layout(local_size_x = 8) in; + +layout(binding = 0) readonly buffer tensorInA { float16_t inA[]; }; +layout(binding = 1) readonly buffer tensorInB { float inB[]; }; +layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout(push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne01; + int ne02; + int ne11; + int ne12; + uint nb01; + uint nb02; + uint nb11; + uint nb12; + uint nb1; + uint nb2; +} +pcs; + + +void main() { + uvec3 gid = gl_GlobalInvocationID; + + uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; + uint bc_ba = pcs.ne02 > pcs.ne12 ? 
gid.z / (pcs.ne02 / pcs.ne12) : gid.z; + + const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 2 + pcs.inAOff; // Based from inA + const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB + float sum = 0.0f; + for (uint i = 0; i < pcs.ne00; i ++) { + sum += float(inA[x+i]) * float(inB[y+i]); + } + + out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; +} \ No newline at end of file diff --git a/kompute/op_mul_mat_mat_f32.comp b/kompute/op_mul_mat_mat_f32.comp new file mode 100644 index 000000000..6234322ca --- /dev/null +++ b/kompute/op_mul_mat_mat_f32.comp @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models + * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy + * of this license should accompany this software. Except as expressly granted + * in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#include "common.comp" + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_debug_printf : enable + +// layout(local_size_x = 8) in; + +layout(binding = 0) readonly buffer tensorInA { float inA[]; }; +layout(binding = 1) readonly buffer tensorInB { float inB[]; }; +layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout(push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne01; + int ne02; + int ne11; + int ne12; + uint nb01; + uint nb02; + uint nb11; + uint nb12; + uint nb1; + uint nb2; +} +pcs; + + +void main() { + uvec3 gid = gl_GlobalInvocationID; + + const uint x = (gid.x*pcs.nb01 + gid.z/(pcs.ne12/pcs.ne02)*pcs.nb02) / 4 + pcs.inAOff; // Based from inA + const uint y = (gid.y*pcs.nb11 + gid.z/(pcs.ne02/pcs.ne12)*pcs.nb12) / 4 + pcs.inBOff; // based from inB + float sum = 0.0f; + for (uint i = 0; i < pcs.ne00; i ++) { + sum += float(inA[x+i]) * float(inB[y+i]); + } + + out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; +} diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp new file mode 100644 index 000000000..93dcfdaed --- /dev/null +++ b/kompute/op_mul_mat_mat_q4_0.comp @@ -0,0 +1,77 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models + * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy + * of this license should accompany this software. Except as expressly granted + * in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#include "common.comp" + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_debug_printf : enable + +// layout(local_size_x = 8) in; + +layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout(binding = 1) readonly buffer tensorInB { float inB[]; }; +layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout(push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne01; + int ne02; + int ne11; + int ne12; + uint nb01; + uint nb02; + uint nb11; + uint nb12; + uint nb1; + uint nb2; +} +pcs; + +#define ELS_PER_BLOCK 32 +#define QS_OFFSET 2 +#define BLOCK_SIZE ((ELS_PER_BLOCK / 2) + QS_OFFSET) + +void main() { + uvec3 gid = gl_GlobalInvocationID; + + uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; + uint bc_ba = pcs.ne02 > pcs.ne12 ? 
gid.z / (pcs.ne02 / pcs.ne12) : gid.z; + + + const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA + const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB + float sum = 0.0f; + for (uint i = 0; i < pcs.ne00; i+=ELS_PER_BLOCK) { + for (uint j = 0; j < ELS_PER_BLOCK / 2; j++) { + const uint block_number = i / ELS_PER_BLOCK; + const uint block_offset = block_number * BLOCK_SIZE; + const float d = u8BufToFloat16(inA, x + block_offset); + const uint byte_position_in_block = j; + const int q0 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] & 0x0F) - 8; + const int q1 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] >> 4) - 8; + const float dq0 = d * q0; + const float dq1 = d * q1; + // if (gid.x == 0 && gid.y == 0 && gid.z == 0 && i < 4 && j < 4) { + // debugPrintfEXT("shp=%d,%d,%d gid=%d,%d,%d i=%d, d=%f, q0=%d, q1=%d, dqs=%f,%f\n", + // pcs.ne01, pcs.ne11, pcs.ne12, + // gid.x, gid.y, gid.z, i, d, q0, q1, dq0, dq1 + // ); + // } + sum += (dq0 * float(inB[y+i+j])) + \ + (dq1 * float(inB[y+i+j+(ELS_PER_BLOCK/2)])); + } + } + + out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; +} \ No newline at end of file diff --git a/kompute/op_mul_mat_mat_q8_0.comp b/kompute/op_mul_mat_mat_q8_0.comp new file mode 100644 index 000000000..715e533e2 --- /dev/null +++ b/kompute/op_mul_mat_mat_q8_0.comp @@ -0,0 +1,66 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models + * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy + * of this license should accompany this software. Except as expressly granted + * in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#include "common.comp" + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_debug_printf : enable + +// layout(local_size_x = 8) in; + +layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout(binding = 1) readonly buffer tensorInB { float inB[]; }; +layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout(push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne01; + int ne02; + int ne11; + int ne12; + uint nb01; + uint nb02; + uint nb11; + uint nb12; + uint nb1; + uint nb2; +} +pcs; + +#define ELS_PER_BLOCK 32 +#define QS_OFFSET 2 // d +#define BLOCK_SIZE (ELS_PER_BLOCK + 2) + +void main() { + uvec3 gid = gl_GlobalInvocationID; + + uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; + uint bc_ba = pcs.ne02 > pcs.ne12 ? 
gid.z / (pcs.ne02 / pcs.ne12) : gid.z; + + + const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA + const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB + float sum = 0.0f; + for (uint i = 0; i < pcs.ne00; i++) { + const uint block_number = i / ELS_PER_BLOCK; + const uint block_offset = block_number * BLOCK_SIZE; + const float d = u8BufToFloat16(inA, x + block_offset); + const uint position_in_block = i % ELS_PER_BLOCK; + const int q0 = int8_t(inA[x+block_offset+QS_OFFSET+position_in_block]); + const float dq0 = d * q0; + sum += (dq0 * float(inB[y+i])); + } + + out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; +} \ No newline at end of file diff --git a/llama.cpp b/llama.cpp index f5e0eac81..0ff459ba5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3855,7 +3855,7 @@ static bool llama_eval_internal( ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); } #elif defined(GGML_USE_KOMPUTE) - if (lctx.ctx_kompute && N == 1) { + if (lctx.ctx_kompute) { // && N == 1) { ggml_vk_graph_compute(lctx.ctx_kompute, gf); ggml_vk_d2h_tensor(lctx.ctx_kompute, res); } else { From 46385ee0d52f38fc7db2a0ec3a071ae8d1bd6511 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Tue, 10 Oct 2023 21:38:18 -0700 Subject: [PATCH 36/93] misc vulkan cleanup make pushconts consistent w/ dispatch, avoid a double free --- ggml-vulkan.cpp | 4 +++- kompute/op_add.comp | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 6ae1a8fc3..a0a2a9b0e 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -307,7 +307,9 @@ bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) { void ggml_vk_free(struct ggml_kompute_context * ctx) { assert(ctx == s_kompute_context); s_kompute_context = nullptr; - delete ctx; + if (ctx != nullptr) { + delete ctx; + } } static diff --git a/kompute/op_add.comp b/kompute/op_add.comp index 019a68449..f242864dd 100644 --- a/kompute/op_add.comp +++ b/kompute/op_add.comp @@ -20,7 +20,6 @@ layout(push_constant) uniform PushConstants { uint inAOff; uint inBOff; uint outOff; - uint row; } pcs; void main() { From 3327d84a7fba14ad0b2778982013a88c808a1132 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Wed, 11 Oct 2023 16:02:53 -0700 Subject: [PATCH 37/93] perf: use bigger threadgroups in mm --- ggml-vulkan.cpp | 2 +- kompute/op_mul_mat_mat_q4_0.comp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index a0a2a9b0e..57813cb3d 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1148,7 +1148,7 @@ void ggml_vk_mul_mat_mat_q4_x(const std::vector& spirv, } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), + s_algo->setWorkgroup({unsigned(ne01)/32, unsigned(ne11), unsigned(std::max(ne12, ne02)), }); diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp index 93dcfdaed..994aadc8a 100644 --- a/kompute/op_mul_mat_mat_q4_0.comp +++ b/kompute/op_mul_mat_mat_q4_0.comp @@ -14,7 +14,7 @@ #extension GL_KHR_shader_subgroup_arithmetic : require #extension GL_EXT_debug_printf : enable -// layout(local_size_x = 8) in; +layout(local_size_x = 32) in; layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; layout(binding = 1) readonly buffer tensorInB { float inB[]; }; From d5741c07a53f86f4d987b7e22f87a72e1da70e46 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Wed, 11 Oct 2023 18:40:07 -0700 Subject: [PATCH 38/93] use op 
param epsilon for norms --- ggml-vulkan.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 57813cb3d..f2320f3cc 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -810,12 +810,10 @@ void ggml_vk_norm_(const std::vector& spirv, kp::Sequence& seq, const std::shared_ptr& out, uint32_t inOff, uint32_t outOff, int32_t ne00, int32_t nb01, - int32_t nrows) { + int32_t nrows, float epsilon) { GGML_ASSERT(nb01%sizeof(float) == 0); GGML_ASSERT(ne00%sizeof(float) == 0); - const float epsilon = 1e-6f; // this is what ggml.c uses for rms norm - struct PushConstants { uint32_t inOff, outOff; uint32_t ne00, nb01; @@ -1559,11 +1557,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_NORM: { - ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0)); + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps); } break; case GGML_OP_RMS_NORM: { - ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0)); + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps); } break; case GGML_OP_MUL_MAT: { From b78a94bc6d72c42bf1f1ac9a867ef232ddc26b04 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Wed, 11 Oct 2023 17:10:42 -0700 Subject: [PATCH 39/93] q6k mm works --- CMakeLists.txt | 2 + ggml-vulkan.cpp | 61 +++++++++++++++++++++- kompute/op_mul_mat_mat_q6_k.comp | 88 ++++++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 kompute/op_mul_mat_mat_q6_k.comp diff --git a/CMakeLists.txt b/CMakeLists.txt index cf4042ea3..fbbb46bbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -483,6 +483,7 @@ if (LLAMA_KOMPUTE) kompute/op_mul_mat_mat_f32.comp kompute/op_mul_mat_mat_q4_0.comp kompute/op_mul_mat_mat_q8_0.comp + kompute/op_mul_mat_mat_q6_k.comp kompute/op_mul_mat_f16.comp kompute/op_mul_mat_q8_0.comp kompute/op_mul_mat_q4_0.comp @@ -517,6 +518,7 @@ if (LLAMA_KOMPUTE) shaderop_mul_mat_mat_f32.h shaderop_mul_mat_mat_q4_0.h shaderop_mul_mat_mat_q8_0.h + shaderop_mul_mat_mat_q6_k.h shaderop_mul_mat_f16.h shaderop_mul_mat_q8_0.h shaderop_mul_mat_q4_0.h diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index f2320f3cc..488683ec3 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -31,6 +31,7 @@ #include "shaderop_mul_mat_mat_f16.h" #include "shaderop_mul_mat_mat_q4_0.h" #include "shaderop_mul_mat_mat_q8_0.h" +#include "shaderop_mul_mat_mat_q6_k.h" #include "shaderop_getrows_f16.h" #include "shaderop_getrows_q4_0.h" #include "shaderop_getrows_q4_1.h" @@ -1109,6 +1110,54 @@ void ggml_vk_mul_mat_mat_q8_0( seq.record(s_algo); } +void ggml_vk_mul_mat_mat_q6_k( + kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, + uint32_t nb01, uint32_t nb02, + int32_t ne11, int32_t ne12, + uint32_t nb11, uint32_t nb12, + uint32_t nb1, uint32_t nb2) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q6_k_comp_spv, + kp::shader_data::op_mul_mat_mat_q6_k_comp_spv_len); + struct PushConstants { + uint32_t inAOff, inBOff, outOff; + int32_t ne00, ne01, ne02, ne11, ne12; + uint32_t nb01, nb02; + uint32_t nb11, nb12; + uint32_t nb1, nb2; + } pushConsts { + 
inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, ne01, ne02, ne11, ne12, + nb01, nb02, nb11, nb12, + nb1, nb2 + }; + + std::shared_ptr s_algo = nullptr; + if (!komputeManager()->hasAlgorithm(__func__)) { + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), + {inA, inB, out}, spirv, + {unsigned(ne01)/32, + unsigned(ne11), + unsigned(std::max(ne12, ne02)) + }, + {}, + {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(__func__); + s_algo->setTensors({inA, inB, out}); + s_algo->setWorkgroup({unsigned(ne01)/32, + unsigned(ne11), + unsigned(std::max(ne12, ne02)), + }); + s_algo->setPushConstants({pushConsts}); + s_algo->updateDescriptors(s_kompute_context->pool.get()); + } + seq.record(s_algo); +} void ggml_vk_mul_mat_mat_q4_x(const std::vector& spirv, kp::Sequence& seq, @@ -1138,7 +1187,7 @@ void ggml_vk_mul_mat_mat_q4_x(const std::vector& spirv, if (!komputeManager()->hasAlgorithm(__func__)) { s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, - {unsigned(ne01), + {unsigned(ne01)/32, unsigned(ne11), unsigned(std::max(ne12, ne02))}, {}, @@ -1619,6 +1668,16 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph ne11, ne12, nb11, nb12, nb1, nb2); + break; + case GGML_TYPE_Q6_K: + ggml_vk_mul_mat_mat_q6_k(seq, + id_src0, id_src1, id_dst, + off_src0, off_src1, off_dst, + ne00, ne01, ne02, + nb01, nb02, + ne11, ne12, + nb11, nb12, + nb1, nb2); break; default: { fprintf(stderr, "%s: %s: Unsupported quantization for M*M: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); diff --git a/kompute/op_mul_mat_mat_q6_k.comp b/kompute/op_mul_mat_mat_q6_k.comp new file mode 100644 index 000000000..127f17df6 --- /dev/null +++ b/kompute/op_mul_mat_mat_q6_k.comp @@ -0,0 +1,88 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models + * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy + * of this license should accompany this software. Except as expressly granted + * in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#include "common.comp" + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_debug_printf : enable + +layout(local_size_x = 32) in; + +layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout(binding = 1) readonly buffer tensorInB { float inB[]; }; +layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout(push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne01; + int ne02; + int ne11; + int ne12; + uint nb01; + uint nb02; + uint nb11; + uint nb12; + uint nb1; + uint nb2; +} +pcs; + + +#define ELS_PER_BLOCK 256 //QK_K +#define QH_OFFSET (ELS_PER_BLOCK / 2) +#define QSCALES_OFFSET (QH_OFFSET + (ELS_PER_BLOCK / 4)) +#define SCALE_SCALE_OFFSET (QSCALES_OFFSET + (ELS_PER_BLOCK / 16)) +#define BLOCK_SIZE (SCALE_SCALE_OFFSET + 2) + +void main() { + uvec3 gid = gl_GlobalInvocationID; + + uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; + uint bc_ba = pcs.ne02 > pcs.ne12 ? 
gid.z / (pcs.ne02 / pcs.ne12) : gid.z; + + const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA + const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB + + float sum = 0.0f; + const uint n_blocks = pcs.ne00 / ELS_PER_BLOCK; + // this is pretty much all lifted right from dequantize_row_q6_K + uint outoff = 0; + for (uint i = 0; i < n_blocks; i++) { + const uint block_number = i; + const uint block_offset = block_number * BLOCK_SIZE; + const float scales_d = u8BufToFloat16(inA, x + block_offset + SCALE_SCALE_OFFSET); + uint qloff = block_offset; + uint qhoff = block_offset + QH_OFFSET; + uint scoff = block_offset + QSCALES_OFFSET; + for (int n = 0; n < 256; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int q1 = int((inA[x + qloff + l + 0] & 0xF) | (((inA[x + qhoff + l] >> 0) & 3) << 4)) - 32; + const int q2 = int((inA[x + qloff + l + 32] & 0xF) | (((inA[x + qhoff + l] >> 2) & 3) << 4)) - 32; + const int q3 = int((inA[x + qloff + l + 0] >> 4) | (((inA[x + qhoff + l] >> 4) & 3) << 4)) - 32; + const int q4 = int((inA[x + qloff + l + 32] >> 4) | (((inA[x + qhoff + l] >> 6) & 3) << 4)) - 32; + sum += inB[y + outoff + l + 0] * scales_d * int8_t(inA[x + scoff + is + 0]) * q1; + sum += inB[y + outoff + l + 32] * scales_d * int8_t(inA[x + scoff + is + 2]) * q2; + sum += inB[y + outoff + l + 64] * scales_d * int8_t(inA[x + scoff + is + 4]) * q3; + sum += inB[y + outoff + l + 96] * scales_d * int8_t(inA[x + scoff + is + 6]) * q4; + } + outoff += 128; + qloff += 64; + qhoff += 32; + scoff += 8; + } + } + + out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; +} \ No newline at end of file From 4809890d805ff27752fd344a281250888a86acdd Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Thu, 12 Oct 2023 10:23:09 -0700 Subject: [PATCH 40/93] rm commented dbg print --- kompute/op_mul_mat_mat_q4_0.comp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp index 994aadc8a..aecd04cca 100644 --- a/kompute/op_mul_mat_mat_q4_0.comp +++ b/kompute/op_mul_mat_mat_q4_0.comp @@ -62,12 +62,6 @@ void main() { const int q1 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] >> 4) - 8; const float dq0 = d * q0; const float dq1 = d * q1; - // if (gid.x == 0 && gid.y == 0 && gid.z == 0 && i < 4 && j < 4) { - // debugPrintfEXT("shp=%d,%d,%d gid=%d,%d,%d i=%d, d=%f, q0=%d, q1=%d, dqs=%f,%f\n", - // pcs.ne01, pcs.ne11, pcs.ne12, - // gid.x, gid.y, gid.z, i, d, q0, q1, dq0, dq1 - // ); - // } sum += (dq0 * float(inB[y+i+j])) + \ (dq1 * float(inB[y+i+j+(ELS_PER_BLOCK/2)])); } From cd0257ed0d748465d5753eeff74dffea92d91641 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Thu, 12 Oct 2023 11:22:31 -0700 Subject: [PATCH 41/93] q4_1 mat*mat --- CMakeLists.txt | 2 + ggml-vulkan.cpp | 19 +++++++++ kompute/op_mul_mat_mat_q4_1.comp | 73 ++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 kompute/op_mul_mat_mat_q4_1.comp diff --git a/CMakeLists.txt b/CMakeLists.txt index fbbb46bbf..df6b53dce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -482,6 +482,7 @@ if (LLAMA_KOMPUTE) kompute/op_mul_mat_mat_f16.comp kompute/op_mul_mat_mat_f32.comp kompute/op_mul_mat_mat_q4_0.comp + kompute/op_mul_mat_mat_q4_1.comp kompute/op_mul_mat_mat_q8_0.comp kompute/op_mul_mat_mat_q6_k.comp kompute/op_mul_mat_f16.comp @@ -517,6 +518,7 @@ if (LLAMA_KOMPUTE) shaderop_mul_mat_mat_f16.h shaderop_mul_mat_mat_f32.h shaderop_mul_mat_mat_q4_0.h + 
shaderop_mul_mat_mat_q4_1.h shaderop_mul_mat_mat_q8_0.h shaderop_mul_mat_mat_q6_k.h shaderop_mul_mat_f16.h diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 488683ec3..56f15310d 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -30,6 +30,7 @@ #include "shaderop_mul_mat_mat_f32.h" #include "shaderop_mul_mat_mat_f16.h" #include "shaderop_mul_mat_mat_q4_0.h" +#include "shaderop_mul_mat_mat_q4_1.h" #include "shaderop_mul_mat_mat_q8_0.h" #include "shaderop_mul_mat_mat_q6_k.h" #include "shaderop_getrows_f16.h" @@ -1214,6 +1215,14 @@ void ggml_vk_mul_mat_mat_q4_0(Args&&... args) { ggml_vk_mul_mat_mat_q4_x(spirv, std::forward(args)...); } +template +void ggml_vk_mul_mat_mat_q4_1(Args&&... args) { + const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q4_1_comp_spv, + kp::shader_data::op_mul_mat_mat_q4_1_comp_spv_len); + + ggml_vk_mul_mat_mat_q4_x(spirv, std::forward(args)...); +} + void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_size, kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, @@ -1659,6 +1668,16 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph nb11, nb12, nb1, nb2); break; + case GGML_TYPE_Q4_1: + ggml_vk_mul_mat_mat_q4_1(seq, + id_src0, id_src1, id_dst, + off_src0, off_src1, off_dst, + ne00, ne01, ne02, + nb01, nb02, + ne11, ne12, + nb11, nb12, + nb1, nb2); + break; case GGML_TYPE_Q8_0: ggml_vk_mul_mat_mat_q8_0(seq, id_src0, id_src1, id_dst, diff --git a/kompute/op_mul_mat_mat_q4_1.comp b/kompute/op_mul_mat_mat_q4_1.comp new file mode 100644 index 000000000..d7fbc96db --- /dev/null +++ b/kompute/op_mul_mat_mat_q4_1.comp @@ -0,0 +1,73 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models + * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy + * of this license should accompany this software. Except as expressly granted + * in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#include "common.comp" + +#extension GL_KHR_shader_subgroup_arithmetic : require +#extension GL_EXT_debug_printf : enable + +layout(local_size_x = 32) in; + +layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; +layout(binding = 1) readonly buffer tensorInB { float inB[]; }; +layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; + +layout(push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int ne00; + int ne01; + int ne02; + int ne11; + int ne12; + uint nb01; + uint nb02; + uint nb11; + uint nb12; + uint nb1; + uint nb2; +} +pcs; + +#define ELS_PER_BLOCK 32 +#define M_OFFSET 2 +#define QS_OFFSET 4 +#define BLOCK_SIZE ((ELS_PER_BLOCK / 2) + QS_OFFSET) + +void main() { + uvec3 gid = gl_GlobalInvocationID; + + uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; + uint bc_ba = pcs.ne02 > pcs.ne12 ? 
gid.z / (pcs.ne02 / pcs.ne12) : gid.z; + + + const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA + const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB + float sum = 0.0f; + for (uint i = 0; i < pcs.ne00; i+=ELS_PER_BLOCK) { + for (uint j = 0; j < ELS_PER_BLOCK / 2; j++) { + const uint block_number = i / ELS_PER_BLOCK; + const uint block_offset = block_number * BLOCK_SIZE; + const float d = u8BufToFloat16(inA, x + block_offset); + const float m = u8BufToFloat16(inA, x + block_offset + M_OFFSET); + const uint byte_position_in_block = j; + const int q0 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] & 0x0F); + const int q1 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] >> 4); + const float dq0 = (d * q0) + m; + const float dq1 = (d * q1) + m; + sum += (dq0 * float(inB[y+i+j])) + \ + (dq1 * float(inB[y+i+j+(ELS_PER_BLOCK/2)])); + } + } + + out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; +} From 8dc79ac380942a8a0006ff7123d1b126130cba3c Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Thu, 12 Oct 2023 11:46:30 -0700 Subject: [PATCH 42/93] clean up vulkan/cpu switch --- llama.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0ff459ba5..3afbebe2a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3855,19 +3855,11 @@ static bool llama_eval_internal( ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); } #elif defined(GGML_USE_KOMPUTE) - if (lctx.ctx_kompute) { // && N == 1) { + if (lctx.ctx_kompute) { ggml_vk_graph_compute(lctx.ctx_kompute, gf); ggml_vk_d2h_tensor(lctx.ctx_kompute, res); } else { - if (lctx.ctx_kompute) { - ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k); - ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v); - } ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); - if (lctx.ctx_kompute) { - ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k); - ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v); - } } #else ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); From 9bc52ebae313c028c2293c260d12d0d0049c5ea1 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Fri, 13 Oct 2023 11:10:02 -0700 Subject: [PATCH 43/93] attempted speedups --- ggml-vulkan.cpp | 9 ++++---- kompute/op_mul_mat_mat_q4_0.comp | 38 ++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 56f15310d..67270a3c7 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1186,17 +1186,18 @@ void ggml_vk_mul_mat_mat_q4_x(const std::vector& spirv, std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), + const uint32_t local_x = ggml_vk_current_device().subgroupSize; + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, - {unsigned(ne01)/32, + {unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02))}, - {}, + {local_x, 4}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01)/32, + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02)), }); diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp index aecd04cca..80a1ff627 100644 --- a/kompute/op_mul_mat_mat_q4_0.comp +++ b/kompute/op_mul_mat_mat_q4_0.comp @@ -14,7 +14,9 @@ #extension GL_KHR_shader_subgroup_arithmetic : require 
#extension GL_EXT_debug_printf : enable -layout(local_size_x = 32) in; +layout (local_size_x_id = 0) in; +layout (local_size_y_id = 1) in; +layout (constant_id = 1) const uint nsg = 2; layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; layout(binding = 1) readonly buffer tensorInB { float inB[]; }; @@ -38,34 +40,38 @@ layout(push_constant) uniform parameter { } pcs; -#define ELS_PER_BLOCK 32 -#define QS_OFFSET 2 -#define BLOCK_SIZE ((ELS_PER_BLOCK / 2) + QS_OFFSET) +const uint els_per_block = 32; +const uint qs_offset = 2; +const uint block_size = (els_per_block / 2) + qs_offset; + void main() { - uvec3 gid = gl_GlobalInvocationID; + uvec3 gid = gl_WorkGroupID; + uvec3 lid = gl_LocalInvocationID; + gid.y = gid.y * nsg + lid.y; uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z; - const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB float sum = 0.0f; - for (uint i = 0; i < pcs.ne00; i+=ELS_PER_BLOCK) { - for (uint j = 0; j < ELS_PER_BLOCK / 2; j++) { - const uint block_number = i / ELS_PER_BLOCK; - const uint block_offset = block_number * BLOCK_SIZE; + for (uint i = gl_SubgroupInvocationID * 2; i < pcs.ne00; i+=gl_SubgroupSize * 2) { + const uint block_number = i / els_per_block; + const uint block_offset = block_number * block_size; const float d = u8BufToFloat16(inA, x + block_offset); + const uint j = (i % els_per_block) / 2; const uint byte_position_in_block = j; - const int q0 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] & 0x0F) - 8; - const int q1 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] >> 4) - 8; + const int q0 = (inA[x+block_offset+qs_offset+byte_position_in_block] & 0x0F) - 8; + const int q1 = (inA[x+block_offset+qs_offset+byte_position_in_block] >> 4) - 8; const float dq0 = d * q0; const float dq1 = d * q1; - sum += (dq0 * float(inB[y+i+j])) + \ - (dq1 * float(inB[y+i+j+(ELS_PER_BLOCK/2)])); - } + const uint block_base = block_number * els_per_block; + sum += (dq0 * float(inB[y+block_base+j])) + \ + (dq1 * float(inB[y+block_base+j+(els_per_block/2)])); } - out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; + const float all_sum = subgroupAdd(sum); + if (subgroupElect()) + out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum; } \ No newline at end of file From c1fd64548d2c8d42eaedae940c619a6cf2d9741f Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Fri, 13 Oct 2023 13:14:36 -0700 Subject: [PATCH 44/93] attempted speedups 2 --- ggml-vulkan.cpp | 24 +++++++++++++----------- kompute/op_mul_mat_mat_f16.comp | 12 ++++++++---- kompute/op_mul_mat_mat_f32.comp | 21 ++++++++++++++------- kompute/op_mul_mat_mat_q6_k.comp | 2 +- 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 67270a3c7..010f49226 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -989,26 +989,27 @@ void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq, nb1, nb2 }; + const uint32_t local_x = ggml_vk_current_device().subgroupSize; std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) { - //std::cerr << "init f32 matmat shader" << std::endl; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), - 
unsigned(ne12)}, - {}, + unsigned(std::max(ne12, ne02)) + }, + {local_x}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne11), - unsigned(std::max(ne12, ne02))}); + unsigned(std::max(ne12, ne02)), + }); s_algo->setPushConstants({pushConsts}); s_algo->updateDescriptors(s_kompute_context->pool.get()); } - //seq.record({out}); seq.record(s_algo); } @@ -1038,15 +1039,16 @@ void ggml_vk_mul_mat_mat_f16(kp::Sequence& seq, nb1, nb2 }; + const uint32_t local_x = ggml_vk_current_device().subgroupSize; std::shared_ptr s_algo = nullptr; if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02)) }, - {}, + {local_x}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(__func__); @@ -1141,7 +1143,7 @@ void ggml_vk_mul_mat_mat_q6_k( if (!komputeManager()->hasAlgorithm(__func__)) { s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, - {unsigned(ne01)/32, + {unsigned(ne01)/256, unsigned(ne11), unsigned(std::max(ne12, ne02)) }, @@ -1150,7 +1152,7 @@ void ggml_vk_mul_mat_mat_q6_k( } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01)/32, + s_algo->setWorkgroup({unsigned(ne01)/256, unsigned(ne11), unsigned(std::max(ne12, ne02)), }); @@ -1192,7 +1194,7 @@ void ggml_vk_mul_mat_mat_q4_x(const std::vector& spirv, {unsigned(ne01), unsigned(ne11), unsigned(std::max(ne12, ne02))}, - {local_x, 4}, + {local_x, 1}, {pushConsts}); } else { s_algo = komputeManager()->getAlgorithm(__func__); diff --git a/kompute/op_mul_mat_mat_f16.comp b/kompute/op_mul_mat_mat_f16.comp index b62f06d10..03872fed5 100644 --- a/kompute/op_mul_mat_mat_f16.comp +++ b/kompute/op_mul_mat_mat_f16.comp @@ -14,7 +14,8 @@ #extension GL_KHR_shader_subgroup_arithmetic : require #extension GL_EXT_debug_printf : enable -// layout(local_size_x = 8) in; +// device subgroup size +layout (local_size_x_id = 0) in; layout(binding = 0) readonly buffer tensorInA { float16_t inA[]; }; layout(binding = 1) readonly buffer tensorInB { float inB[]; }; @@ -40,7 +41,7 @@ pcs; void main() { - uvec3 gid = gl_GlobalInvocationID; + uvec3 gid = gl_WorkGroupID; uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; uint bc_ba = pcs.ne02 > pcs.ne12 ? 
gid.z / (pcs.ne02 / pcs.ne12) : gid.z; @@ -48,9 +49,12 @@ void main() { const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 2 + pcs.inAOff; // Based from inA const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB float sum = 0.0f; - for (uint i = 0; i < pcs.ne00; i ++) { + for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { sum += float(inA[x+i]) * float(inB[y+i]); } - out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; + const float all_sum = subgroupAdd(sum); + if (subgroupElect()) { + out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum; + } } \ No newline at end of file diff --git a/kompute/op_mul_mat_mat_f32.comp b/kompute/op_mul_mat_mat_f32.comp index 6234322ca..a2dba0560 100644 --- a/kompute/op_mul_mat_mat_f32.comp +++ b/kompute/op_mul_mat_mat_f32.comp @@ -14,7 +14,8 @@ #extension GL_KHR_shader_subgroup_arithmetic : require #extension GL_EXT_debug_printf : enable -// layout(local_size_x = 8) in; +// device subgroup size +layout (local_size_x_id = 0) in; layout(binding = 0) readonly buffer tensorInA { float inA[]; }; layout(binding = 1) readonly buffer tensorInB { float inB[]; }; @@ -40,14 +41,20 @@ pcs; void main() { - uvec3 gid = gl_GlobalInvocationID; + uvec3 gid = gl_WorkGroupID; - const uint x = (gid.x*pcs.nb01 + gid.z/(pcs.ne12/pcs.ne02)*pcs.nb02) / 4 + pcs.inAOff; // Based from inA - const uint y = (gid.y*pcs.nb11 + gid.z/(pcs.ne02/pcs.ne12)*pcs.nb12) / 4 + pcs.inBOff; // based from inB + uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; + uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z; + + const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA + const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB float sum = 0.0f; - for (uint i = 0; i < pcs.ne00; i ++) { + for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { sum += float(inA[x+i]) * float(inB[y+i]); } - out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; -} + const float all_sum = subgroupAdd(sum); + if (subgroupElect()) { + out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum; + } +} \ No newline at end of file diff --git a/kompute/op_mul_mat_mat_q6_k.comp b/kompute/op_mul_mat_mat_q6_k.comp index 127f17df6..8e3e44d7d 100644 --- a/kompute/op_mul_mat_mat_q6_k.comp +++ b/kompute/op_mul_mat_mat_q6_k.comp @@ -14,7 +14,7 @@ #extension GL_KHR_shader_subgroup_arithmetic : require #extension GL_EXT_debug_printf : enable -layout(local_size_x = 32) in; +layout(local_size_x = 256) in; layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; layout(binding = 1) readonly buffer tensorInB { float inB[]; }; From cc05a602d6e20d514330cd424cb45438ff87f9ea Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Mon, 16 Oct 2023 10:00:25 -0700 Subject: [PATCH 45/93] use mat*vec shaders for mat*mat I wrote the mat*mat shaders from scratch so I understand them better but they are currently not faster than just multiply-invoking the mat*vec shaders, by a significant degree - so, except for f32 which needed a new shader, revert to the m*v ones here. 
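As context for the revert: a matrix-matrix product decomposes into one independent matrix-vector product per output column (the ne11 dimension in the dispatches below), which is why re-invoking the existing mat*vec shaders with an extra grid dimension covers the mat*mat case. A minimal standalone C++ sketch of that decomposition follows; the helper names are invented for illustration and the code is not part of the patch.

#include <cstdio>
#include <vector>

// y = A (rows x cols, row-major) * x
static std::vector<float> matvec(const std::vector<float> &A,
                                 const std::vector<float> &x,
                                 int rows, int cols) {
    std::vector<float> y(rows, 0.0f);
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            y[r] += A[r * cols + c] * x[c];
    return y;
}

// C = A * B computed one column of B at a time with the mat*vec routine above,
// mirroring how the backend grids a mat*vec shader over the second operand.
static std::vector<float> matmul_via_matvec(const std::vector<float> &A,
                                            const std::vector<float> &B,
                                            int rows, int inner, int bcols) {
    std::vector<float> C(rows * bcols, 0.0f);
    for (int j = 0; j < bcols; ++j) {
        std::vector<float> col(inner);
        for (int k = 0; k < inner; ++k) col[k] = B[k * bcols + j];
        const std::vector<float> y = matvec(A, col, rows, inner);
        for (int r = 0; r < rows; ++r) C[r * bcols + j] = y[r];
    }
    return C;
}

int main() {
    const std::vector<float> A = {1, 2, 3, 4, 5, 6};      // 2x3
    const std::vector<float> B = {7, 8, 9, 10, 11, 12};   // 3x2
    const std::vector<float> C = matmul_via_matvec(A, B, 2, 3, 2);
    std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);
    return 0;
}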
--- CMakeLists.txt | 10 - ggml-vulkan.cpp | 330 +++---------------------------- kompute/op_mul_mat_mat_f16.comp | 60 ------ kompute/op_mul_mat_mat_q4_0.comp | 77 -------- kompute/op_mul_mat_mat_q4_1.comp | 73 ------- kompute/op_mul_mat_mat_q6_k.comp | 88 --------- kompute/op_mul_mat_mat_q8_0.comp | 66 ------- 7 files changed, 27 insertions(+), 677 deletions(-) delete mode 100644 kompute/op_mul_mat_mat_f16.comp delete mode 100644 kompute/op_mul_mat_mat_q4_0.comp delete mode 100644 kompute/op_mul_mat_mat_q4_1.comp delete mode 100644 kompute/op_mul_mat_mat_q6_k.comp delete mode 100644 kompute/op_mul_mat_mat_q8_0.comp diff --git a/CMakeLists.txt b/CMakeLists.txt index df6b53dce..33a8bdd17 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -479,12 +479,7 @@ if (LLAMA_KOMPUTE) kompute/op_norm.comp kompute/op_rmsnorm.comp kompute/op_diagmask.comp - kompute/op_mul_mat_mat_f16.comp kompute/op_mul_mat_mat_f32.comp - kompute/op_mul_mat_mat_q4_0.comp - kompute/op_mul_mat_mat_q4_1.comp - kompute/op_mul_mat_mat_q8_0.comp - kompute/op_mul_mat_mat_q6_k.comp kompute/op_mul_mat_f16.comp kompute/op_mul_mat_q8_0.comp kompute/op_mul_mat_q4_0.comp @@ -515,12 +510,7 @@ if (LLAMA_KOMPUTE) shaderop_norm.h shaderop_rmsnorm.h shaderop_diagmask.h - shaderop_mul_mat_mat_f16.h shaderop_mul_mat_mat_f32.h - shaderop_mul_mat_mat_q4_0.h - shaderop_mul_mat_mat_q4_1.h - shaderop_mul_mat_mat_q8_0.h - shaderop_mul_mat_mat_q6_k.h shaderop_mul_mat_f16.h shaderop_mul_mat_q8_0.h shaderop_mul_mat_q4_0.h diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 010f49226..08042330f 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -28,11 +28,6 @@ #include "shaderop_mul_mat_q4_1.h" #include "shaderop_mul_mat_q6_k.h" #include "shaderop_mul_mat_mat_f32.h" -#include "shaderop_mul_mat_mat_f16.h" -#include "shaderop_mul_mat_mat_q4_0.h" -#include "shaderop_mul_mat_mat_q4_1.h" -#include "shaderop_mul_mat_mat_q8_0.h" -#include "shaderop_mul_mat_mat_q6_k.h" #include "shaderop_getrows_f16.h" #include "shaderop_getrows_q4_0.h" #include "shaderop_getrows_q4_1.h" @@ -1013,219 +1008,6 @@ void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq, seq.record(s_algo); } -void ggml_vk_mul_mat_mat_f16(kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - uint32_t nb01, uint32_t nb02, - int32_t ne11, int32_t ne12, - uint32_t nb11, uint32_t nb12, - uint32_t nb1, uint32_t nb2) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f16_comp_spv, - kp::shader_data::op_mul_mat_mat_f16_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02, ne11, ne12; - uint32_t nb01, nb02; - uint32_t nb11, nb12; - uint32_t nb1, nb2; - } pushConsts { - safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, ne11, ne12, - nb01, nb02, nb11, nb12, - nb1, nb2 - }; - - const uint32_t local_x = ggml_vk_current_device().subgroupSize; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), - {inA, inB, out}, spirv, - {unsigned(ne01), - unsigned(ne11), - unsigned(std::max(ne12, ne02)) - }, - {local_x}, - {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), - unsigned(ne11), - unsigned(std::max(ne12, ne02)), - }); - 
s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - - -void ggml_vk_mul_mat_mat_q8_0( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - uint32_t nb01, uint32_t nb02, - int32_t ne11, int32_t ne12, - uint32_t nb11, uint32_t nb12, - uint32_t nb1, uint32_t nb2) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q8_0_comp_spv, - kp::shader_data::op_mul_mat_mat_q8_0_comp_spv_len); - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02, ne11, ne12; - uint32_t nb01, nb02; - uint32_t nb11, nb12; - uint32_t nb1, nb2; - } pushConsts { - inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, ne11, ne12, - nb01, nb02, nb11, nb12, - nb1, nb2 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), - {inA, inB, out}, spirv, - {unsigned(ne01), - unsigned(ne11), - unsigned(std::max(ne12, ne02)) - }, - {}, - {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), - unsigned(ne11), - unsigned(std::max(ne12, ne02)), - }); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -void ggml_vk_mul_mat_mat_q6_k( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - uint32_t nb01, uint32_t nb02, - int32_t ne11, int32_t ne12, - uint32_t nb11, uint32_t nb12, - uint32_t nb1, uint32_t nb2) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q6_k_comp_spv, - kp::shader_data::op_mul_mat_mat_q6_k_comp_spv_len); - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02, ne11, ne12; - uint32_t nb01, nb02; - uint32_t nb11, nb12; - uint32_t nb1, nb2; - } pushConsts { - inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, ne11, ne12, - nb01, nb02, nb11, nb12, - nb1, nb2 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), - {inA, inB, out}, spirv, - {unsigned(ne01)/256, - unsigned(ne11), - unsigned(std::max(ne12, ne02)) - }, - {}, - {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01)/256, - unsigned(ne11), - unsigned(std::max(ne12, ne02)), - }); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -void ggml_vk_mul_mat_mat_q4_x(const std::vector& spirv, - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - uint32_t nb01, uint32_t nb02, - int32_t ne11, int32_t ne12, - uint32_t nb11, uint32_t nb12, - uint32_t nb1, uint32_t nb2) { - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02, ne11, ne12; - uint32_t nb01, 
nb02; - uint32_t nb11, nb12; - uint32_t nb1, nb2; - } pushConsts { - inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, ne11, ne12, - nb01, nb02, nb11, nb12, - nb1, nb2 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - const uint32_t local_x = ggml_vk_current_device().subgroupSize; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), - {inA, inB, out}, spirv, - {unsigned(ne01), - unsigned(ne11), - unsigned(std::max(ne12, ne02))}, - {local_x, 1}, - {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), - unsigned(ne11), - unsigned(std::max(ne12, ne02)), - }); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - - -template -void ggml_vk_mul_mat_mat_q4_0(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q4_0_comp_spv, - kp::shader_data::op_mul_mat_mat_q4_0_comp_spv_len); - - ggml_vk_mul_mat_mat_q4_x(spirv, std::forward(args)...); -} - -template -void ggml_vk_mul_mat_mat_q4_1(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_q4_1_comp_spv, - kp::shader_data::op_mul_mat_mat_q4_1_comp_spv_len); - - ggml_vk_mul_mat_mat_q4_x(spirv, std::forward(args)...); -} - void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_size, kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, @@ -1635,54 +1417,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph goto not_implemented; } - if (!ggml_is_transposed(src0) - && !ggml_is_transposed(src1) - //&& ne00%32 == 0 - && ne11 > 1 - ) { - switch (src0t) { - case GGML_TYPE_F32: - ggml_vk_mul_mat_mat_f32(seq, - id_src0, id_src1, id_dst, - off_src0, off_src1, off_dst, - ne00, ne01, ne02, - nb01, nb02, - ne11, ne12, - nb11, nb12, - nb1, nb2); - break; - case GGML_TYPE_F16: - ggml_vk_mul_mat_mat_f16(seq, - id_src0, id_src1, id_dst, - off_src0, off_src1, off_dst, - ne00, ne01, ne02, - nb01, nb02, - ne11, ne12, - nb11, nb12, - nb1, nb2); - break; - case GGML_TYPE_Q4_0: - ggml_vk_mul_mat_mat_q4_0(seq, - id_src0, id_src1, id_dst, - off_src0, off_src1, off_dst, - ne00, ne01, ne02, - nb01, nb02, - ne11, ne12, - nb11, nb12, - nb1, nb2); - break; - case GGML_TYPE_Q4_1: - ggml_vk_mul_mat_mat_q4_1(seq, - id_src0, id_src1, id_dst, - off_src0, off_src1, off_dst, - ne00, ne01, ne02, - nb01, nb02, - ne11, ne12, - nb11, nb12, - nb1, nb2); - break; - case GGML_TYPE_Q8_0: - ggml_vk_mul_mat_mat_q8_0(seq, + if (ggml_is_transposed(src0) || + ggml_is_transposed(src1)) { + fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); + goto not_implemented; + } + + switch (src0t) { + case GGML_TYPE_F32: + ggml_vk_mul_mat_mat_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, @@ -1690,46 +1433,27 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph ne11, ne12, nb11, nb12, nb1, nb2); + case GGML_TYPE_F16: + ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); break; - case GGML_TYPE_Q6_K: - ggml_vk_mul_mat_mat_q6_k(seq, - id_src0, id_src1, id_dst, - off_src0, off_src1, off_dst, - ne00, ne01, ne02, - nb01, nb02, - ne11, ne12, - nb11, nb12, - nb1, nb2); - 
break; - default: { - fprintf(stderr, "%s: %s: Unsupported quantization for M*M: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); - goto not_implemented; - } - } - } else { - switch (src0t) { - case GGML_TYPE_F16: - case GGML_TYPE_F32: - ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); - break; - case GGML_TYPE_Q8_0: - ggml_vk_mul_mat_q8_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); - break; - case GGML_TYPE_Q4_0: - ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); - break; - case GGML_TYPE_Q4_1: - ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); - break; - case GGML_TYPE_Q6_K: - ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); - break; - default: { - fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); - goto not_implemented; - } + case GGML_TYPE_Q8_0: + ggml_vk_mul_mat_q8_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); + break; + case GGML_TYPE_Q4_0: + ggml_vk_mul_mat_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); + break; + case GGML_TYPE_Q4_1: + ggml_vk_mul_mat_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); + break; + case GGML_TYPE_Q6_K: + ggml_vk_mul_mat_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02); + break; + default: { + fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); + goto not_implemented; } } + } break; case GGML_OP_GET_ROWS: { diff --git a/kompute/op_mul_mat_mat_f16.comp b/kompute/op_mul_mat_mat_f16.comp deleted file mode 100644 index 03872fed5..000000000 --- a/kompute/op_mul_mat_mat_f16.comp +++ /dev/null @@ -1,60 +0,0 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models - * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy - * of this license should accompany this software. Except as expressly granted - * in the SOM license, all rights are reserved by Nomic, Inc. - */ - -#version 450 - -#include "common.comp" - -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_EXT_debug_printf : enable - -// device subgroup size -layout (local_size_x_id = 0) in; - -layout(binding = 0) readonly buffer tensorInA { float16_t inA[]; }; -layout(binding = 1) readonly buffer tensorInB { float inB[]; }; -layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout(push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - int ne11; - int ne12; - uint nb01; - uint nb02; - uint nb11; - uint nb12; - uint nb1; - uint nb2; -} -pcs; - - -void main() { - uvec3 gid = gl_WorkGroupID; - - uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; - uint bc_ba = pcs.ne02 > pcs.ne12 ? 
gid.z / (pcs.ne02 / pcs.ne12) : gid.z; - - const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 2 + pcs.inAOff; // Based from inA - const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB - float sum = 0.0f; - for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { - sum += float(inA[x+i]) * float(inB[y+i]); - } - - const float all_sum = subgroupAdd(sum); - if (subgroupElect()) { - out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum; - } -} \ No newline at end of file diff --git a/kompute/op_mul_mat_mat_q4_0.comp b/kompute/op_mul_mat_mat_q4_0.comp deleted file mode 100644 index 80a1ff627..000000000 --- a/kompute/op_mul_mat_mat_q4_0.comp +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models - * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy - * of this license should accompany this software. Except as expressly granted - * in the SOM license, all rights are reserved by Nomic, Inc. - */ - -#version 450 - -#include "common.comp" - -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_EXT_debug_printf : enable - -layout (local_size_x_id = 0) in; -layout (local_size_y_id = 1) in; -layout (constant_id = 1) const uint nsg = 2; - -layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout(binding = 1) readonly buffer tensorInB { float inB[]; }; -layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout(push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - int ne11; - int ne12; - uint nb01; - uint nb02; - uint nb11; - uint nb12; - uint nb1; - uint nb2; -} -pcs; - -const uint els_per_block = 32; -const uint qs_offset = 2; -const uint block_size = (els_per_block / 2) + qs_offset; - - -void main() { - uvec3 gid = gl_WorkGroupID; - uvec3 lid = gl_LocalInvocationID; - gid.y = gid.y * nsg + lid.y; - - uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; - uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z; - - const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA - const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB - float sum = 0.0f; - for (uint i = gl_SubgroupInvocationID * 2; i < pcs.ne00; i+=gl_SubgroupSize * 2) { - const uint block_number = i / els_per_block; - const uint block_offset = block_number * block_size; - const float d = u8BufToFloat16(inA, x + block_offset); - const uint j = (i % els_per_block) / 2; - const uint byte_position_in_block = j; - const int q0 = (inA[x+block_offset+qs_offset+byte_position_in_block] & 0x0F) - 8; - const int q1 = (inA[x+block_offset+qs_offset+byte_position_in_block] >> 4) - 8; - const float dq0 = d * q0; - const float dq1 = d * q1; - const uint block_base = block_number * els_per_block; - sum += (dq0 * float(inB[y+block_base+j])) + \ - (dq1 * float(inB[y+block_base+j+(els_per_block/2)])); - } - - const float all_sum = subgroupAdd(sum); - if (subgroupElect()) - out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum; -} \ No newline at end of file diff --git a/kompute/op_mul_mat_mat_q4_1.comp b/kompute/op_mul_mat_mat_q4_1.comp deleted file mode 100644 index d7fbc96db..000000000 --- a/kompute/op_mul_mat_mat_q4_1.comp +++ /dev/null @@ -1,73 +0,0 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. 
- * - * This software is licensed under the terms of the Software for Open Models - * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy - * of this license should accompany this software. Except as expressly granted - * in the SOM license, all rights are reserved by Nomic, Inc. - */ - -#version 450 - -#include "common.comp" - -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_EXT_debug_printf : enable - -layout(local_size_x = 32) in; - -layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout(binding = 1) readonly buffer tensorInB { float inB[]; }; -layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout(push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - int ne11; - int ne12; - uint nb01; - uint nb02; - uint nb11; - uint nb12; - uint nb1; - uint nb2; -} -pcs; - -#define ELS_PER_BLOCK 32 -#define M_OFFSET 2 -#define QS_OFFSET 4 -#define BLOCK_SIZE ((ELS_PER_BLOCK / 2) + QS_OFFSET) - -void main() { - uvec3 gid = gl_GlobalInvocationID; - - uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; - uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z; - - - const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA - const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB - float sum = 0.0f; - for (uint i = 0; i < pcs.ne00; i+=ELS_PER_BLOCK) { - for (uint j = 0; j < ELS_PER_BLOCK / 2; j++) { - const uint block_number = i / ELS_PER_BLOCK; - const uint block_offset = block_number * BLOCK_SIZE; - const float d = u8BufToFloat16(inA, x + block_offset); - const float m = u8BufToFloat16(inA, x + block_offset + M_OFFSET); - const uint byte_position_in_block = j; - const int q0 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] & 0x0F); - const int q1 = (inA[x+block_offset+QS_OFFSET+byte_position_in_block] >> 4); - const float dq0 = (d * q0) + m; - const float dq1 = (d * q1) + m; - sum += (dq0 * float(inB[y+i+j])) + \ - (dq1 * float(inB[y+i+j+(ELS_PER_BLOCK/2)])); - } - } - - out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; -} diff --git a/kompute/op_mul_mat_mat_q6_k.comp b/kompute/op_mul_mat_mat_q6_k.comp deleted file mode 100644 index 8e3e44d7d..000000000 --- a/kompute/op_mul_mat_mat_q6_k.comp +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models - * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy - * of this license should accompany this software. Except as expressly granted - * in the SOM license, all rights are reserved by Nomic, Inc. 
- */ - -#version 450 - -#include "common.comp" - -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_EXT_debug_printf : enable - -layout(local_size_x = 256) in; - -layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout(binding = 1) readonly buffer tensorInB { float inB[]; }; -layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout(push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - int ne11; - int ne12; - uint nb01; - uint nb02; - uint nb11; - uint nb12; - uint nb1; - uint nb2; -} -pcs; - - -#define ELS_PER_BLOCK 256 //QK_K -#define QH_OFFSET (ELS_PER_BLOCK / 2) -#define QSCALES_OFFSET (QH_OFFSET + (ELS_PER_BLOCK / 4)) -#define SCALE_SCALE_OFFSET (QSCALES_OFFSET + (ELS_PER_BLOCK / 16)) -#define BLOCK_SIZE (SCALE_SCALE_OFFSET + 2) - -void main() { - uvec3 gid = gl_GlobalInvocationID; - - uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; - uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z; - - const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA - const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB - - float sum = 0.0f; - const uint n_blocks = pcs.ne00 / ELS_PER_BLOCK; - // this is pretty much all lifted right from dequantize_row_q6_K - uint outoff = 0; - for (uint i = 0; i < n_blocks; i++) { - const uint block_number = i; - const uint block_offset = block_number * BLOCK_SIZE; - const float scales_d = u8BufToFloat16(inA, x + block_offset + SCALE_SCALE_OFFSET); - uint qloff = block_offset; - uint qhoff = block_offset + QH_OFFSET; - uint scoff = block_offset + QSCALES_OFFSET; - for (int n = 0; n < 256; n += 128) { - for (int l = 0; l < 32; ++l) { - int is = l/16; - const int q1 = int((inA[x + qloff + l + 0] & 0xF) | (((inA[x + qhoff + l] >> 0) & 3) << 4)) - 32; - const int q2 = int((inA[x + qloff + l + 32] & 0xF) | (((inA[x + qhoff + l] >> 2) & 3) << 4)) - 32; - const int q3 = int((inA[x + qloff + l + 0] >> 4) | (((inA[x + qhoff + l] >> 4) & 3) << 4)) - 32; - const int q4 = int((inA[x + qloff + l + 32] >> 4) | (((inA[x + qhoff + l] >> 6) & 3) << 4)) - 32; - sum += inB[y + outoff + l + 0] * scales_d * int8_t(inA[x + scoff + is + 0]) * q1; - sum += inB[y + outoff + l + 32] * scales_d * int8_t(inA[x + scoff + is + 2]) * q2; - sum += inB[y + outoff + l + 64] * scales_d * int8_t(inA[x + scoff + is + 4]) * q3; - sum += inB[y + outoff + l + 96] * scales_d * int8_t(inA[x + scoff + is + 6]) * q4; - } - outoff += 128; - qloff += 64; - qhoff += 32; - scoff += 8; - } - } - - out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; -} \ No newline at end of file diff --git a/kompute/op_mul_mat_mat_q8_0.comp b/kompute/op_mul_mat_mat_q8_0.comp deleted file mode 100644 index 715e533e2..000000000 --- a/kompute/op_mul_mat_mat_q8_0.comp +++ /dev/null @@ -1,66 +0,0 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models - * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy - * of this license should accompany this software. Except as expressly granted - * in the SOM license, all rights are reserved by Nomic, Inc. 
- */ - -#version 450 - -#include "common.comp" - -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_EXT_debug_printf : enable - -// layout(local_size_x = 8) in; - -layout(binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout(binding = 1) readonly buffer tensorInB { float inB[]; }; -layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout(push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - int ne11; - int ne12; - uint nb01; - uint nb02; - uint nb11; - uint nb12; - uint nb1; - uint nb2; -} -pcs; - -#define ELS_PER_BLOCK 32 -#define QS_OFFSET 2 // d -#define BLOCK_SIZE (ELS_PER_BLOCK + 2) - -void main() { - uvec3 gid = gl_GlobalInvocationID; - - uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; - uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z; - - - const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) + pcs.inAOff; // Based from inA - const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB - float sum = 0.0f; - for (uint i = 0; i < pcs.ne00; i++) { - const uint block_number = i / ELS_PER_BLOCK; - const uint block_offset = block_number * BLOCK_SIZE; - const float d = u8BufToFloat16(inA, x + block_offset); - const uint position_in_block = i % ELS_PER_BLOCK; - const int q0 = int8_t(inA[x+block_offset+QS_OFFSET+position_in_block]); - const float dq0 = d * q0; - sum += (dq0 * float(inB[y+i])); - } - - out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = sum; -} \ No newline at end of file From 21841d31635b34a03d63d762af726f1dfae1ca4e Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Mon, 16 Oct 2023 16:51:41 -0400 Subject: [PATCH 46/93] kompute : enable kp_logger and make it static (#8) --- CMakeLists.txt | 1 + kompute/src/logger/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 33a8bdd17..d26aedaf3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -463,6 +463,7 @@ if (LLAMA_KOMPUTE) if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") message(STATUS "Kompute found") + set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level") add_subdirectory(kompute) # Compile our shaders diff --git a/kompute/src/logger/CMakeLists.txt b/kompute/src/logger/CMakeLists.txt index 1dcc1e6b5..1f8695acd 100644 --- a/kompute/src/logger/CMakeLists.txt +++ b/kompute/src/logger/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.20) set(LOGGER_SOURCES Logger.cpp) -add_library(kp_logger ${LOGGER_SOURCES}) +add_library(kp_logger STATIC ${LOGGER_SOURCES}) # Define log levels in code add_compile_definitions(KOMPUTE_LOG_LEVEL_TRACE=0) From cbc0d1af797304d9bf5c27cc0ce8c01064e9d78c Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Mon, 23 Oct 2023 11:46:26 -0400 Subject: [PATCH 47/93] kompute : make scripts executable --- kompute/scripts/convert_shaders.py | 1 + undump.py | 1 + 2 files changed, 2 insertions(+) mode change 100644 => 100755 kompute/scripts/convert_shaders.py mode change 100644 => 100755 undump.py diff --git a/kompute/scripts/convert_shaders.py b/kompute/scripts/convert_shaders.py old mode 100644 new mode 100755 index 9375b6701..11a3ab974 --- a/kompute/scripts/convert_shaders.py +++ b/kompute/scripts/convert_shaders.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 """ Script to handle conversion of compute shaders to spirv and to headers """ diff --git a/undump.py b/undump.py old mode 100644 new mode 100755 index 
db19ffe69..c3d8993be --- a/undump.py +++ b/undump.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 import struct import numpy as np from pathlib import Path From 8400015337705461ecfae335683d265015a4a613 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Thu, 26 Oct 2023 13:00:53 -0400 Subject: [PATCH 48/93] Don't try an allocation on a heap that is smaller than the size we require. --- ggml-vulkan.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 08042330f..265933832 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -364,6 +364,12 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v bool memoryTypeIndexFound = false; vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties(); for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) { + const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i]; + const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex]; + if (memoryHeap.size < size) { + continue; + } + if (requirements.memoryTypeBits & (1 << i)) { if (((memoryProperties.memoryTypes[i]).propertyFlags & flags) == flags) { From 752f7ebd61510a24000704cec7332c842d935588 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Thu, 26 Oct 2023 13:01:40 -0400 Subject: [PATCH 49/93] Remove unused push constant that was giving validation errors. --- kompute/op_mul.comp | 1 - 1 file changed, 1 deletion(-) diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp index 348eae7b3..31849b941 100644 --- a/kompute/op_mul.comp +++ b/kompute/op_mul.comp @@ -20,7 +20,6 @@ layout(push_constant) uniform PushConstants { uint inAOff; uint inBOff; uint outOff; - uint row; } pcs; void main() { From 8d9efbf97a0bcfd9fa60a2279a8a45866ce932c8 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Thu, 26 Oct 2023 11:48:36 -0400 Subject: [PATCH 50/93] Lower the workgroup count for some shaders by providing a loop that processes four floats at a time. 
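In short: the element-wise kernels now launch a quarter as many workgroups and let each one loop over four consecutive floats, which is why the dispatch sites below pass ggml_nelements(dst)/4. A rough standalone C++ analogue of the same decomposition, with invented names, is given here; the sketch assumes the element count is a multiple of four and is not code from the patch.

#include <cstdio>
#include <vector>

// One "workgroup" handles four consecutive elements, mirroring the loop the
// patch adds to op_add.comp and friends. Illustrative only, not patch code.
static void add_workgroup(const float *inA, const float *inB, float *out,
                          unsigned workgroup_id) {
    const unsigned base = workgroup_id * 4;
    for (unsigned x = 0; x < 4; ++x) {
        const unsigned i = base + x;
        out[i] = inA[i] + inB[i];
    }
}

int main() {
    const unsigned n = 16;                          // multiple of 4 by assumption
    std::vector<float> a(n, 1.0f), b(n, 2.0f), c(n, 0.0f);
    for (unsigned wg = 0; wg < n / 4; ++wg)         // n/4 dispatches instead of n
        add_workgroup(a.data(), b.data(), c.data(), wg);
    std::printf("c[0]=%g c[%u]=%g\n", c[0], n - 1, c[n - 1]);
    return 0;
}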
--- ggml-vulkan.cpp | 16 ++++++++-------- kompute/op_add.comp | 9 ++++++--- kompute/op_gelu.comp | 9 ++++++--- kompute/op_mul.comp | 7 +++++-- kompute/op_relu.comp | 7 +++++-- kompute/op_silu.comp | 10 +++++++--- 6 files changed, 37 insertions(+), 21 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 265933832..b70b7ac45 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1358,7 +1358,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph // src1 is a row ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00); } else { - ggml_vk_add(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)); + ggml_vk_add(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4); } } break; case GGML_OP_MUL: @@ -1367,7 +1367,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph // src1 is a row ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00); } else { - ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)); + ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4); } } break; case GGML_OP_SCALE: @@ -1379,15 +1379,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph switch (ggml_get_unary_op(gf->nodes[i])) { case GGML_UNARY_OP_SILU: { - ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)); + ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4); } break; case GGML_UNARY_OP_RELU: { - ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)); + ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4); } break; case GGML_UNARY_OP_GELU: { - ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)); + ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4); } break; default: { @@ -1427,9 +1427,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph ggml_is_transposed(src1)) { fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); goto not_implemented; - } + } - switch (src0t) { + switch (src0t) { case GGML_TYPE_F32: ggml_vk_mul_mat_mat_f32(seq, id_src0, id_src1, id_dst, @@ -1459,7 +1459,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph goto not_implemented; } } - + } break; case GGML_OP_GET_ROWS: { diff --git a/kompute/op_add.comp b/kompute/op_add.comp index f242864dd..314116aac 100644 --- a/kompute/op_add.comp +++ b/kompute/op_add.comp @@ -23,7 +23,10 @@ layout(push_constant) uniform PushConstants { } pcs; void main() { - const uint i = gl_WorkGroupID.x; + const uint baseIndex = gl_WorkGroupID.x * 4; - out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i) + pcs.inBOff]; -} \ No newline at end of file + for (uint x = 0; x < 4; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[i + pcs.inBOff]; + } +} diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp index c9f8ce3cf..f74a14f7e 100644 --- a/kompute/op_gelu.comp +++ b/kompute/op_gelu.comp @@ -20,8 +20,11 @@ layout(push_constant) uniform PushConstants { } pcs; void main() { - const uint i = gl_WorkGroupID.x; - const float x = in_[i + pcs.inOff]; + const uint baseIndex = gl_WorkGroupID.x * 4; - out_[i + pcs.outOff] = 0.5*x*(1.0 + 
tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x))); + for (uint x = 0; x < 4; x++) { + const uint i = baseIndex + x; + const float y = in_[i + pcs.inOff]; + out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y))); + } } diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp index 31849b941..662ea8177 100644 --- a/kompute/op_mul.comp +++ b/kompute/op_mul.comp @@ -23,7 +23,10 @@ layout(push_constant) uniform PushConstants { } pcs; void main() { - const uint i = gl_WorkGroupID.x; + const uint baseIndex = gl_WorkGroupID.x * 4; - out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i) + pcs.inBOff]; + for (uint x = 0; x < 4; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i) + pcs.inBOff]; + } } \ No newline at end of file diff --git a/kompute/op_relu.comp b/kompute/op_relu.comp index 41f46be96..c6ed044a3 100644 --- a/kompute/op_relu.comp +++ b/kompute/op_relu.comp @@ -20,7 +20,10 @@ layout(push_constant) uniform PushConstants { } pcs; void main() { - const uint i = gl_WorkGroupID.x; + const uint baseIndex = gl_WorkGroupID.x * 4; - out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]); + for (uint x = 0; x < 4; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]); + } } diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp index c5acac281..8c7bfe321 100644 --- a/kompute/op_silu.comp +++ b/kompute/op_silu.comp @@ -19,8 +19,12 @@ layout(push_constant) uniform PushConstants { uint outOff; } pcs; void main() { - const uint i = gl_WorkGroupID.x; - const float x = in_[i + pcs.inOff]; - out_[i + pcs.outOff] = x / (1.0 + exp(-x)); + const uint baseIndex = gl_WorkGroupID.x * 4; + + for (uint x = 0; x < 4; x++) { + const uint i = baseIndex + x; + const float y = in_[i + pcs.inOff]; + out_[i + pcs.outOff] = y / (1.0 + exp(-y)); + } } From 74ddf0f17da1daf83de6aaf4ef22274068dcd72f Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Fri, 27 Oct 2023 12:05:24 -0400 Subject: [PATCH 51/93] Fix synchronization problem for AMD Radeon with amdvlk driver or windows drivers. Does not have any performance or fidelity effect on other gpu/driver combos I've tested. 
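The fix is to describe the producer of the data correctly: the buffers being waited on were written by a compute shader, not by a transfer, so the barrier's source access and stage must be eShaderWrite/eComputeShader rather than eTransferWrite/eTransfer. A minimal vulkan-hpp sketch of such a compute-to-compute barrier follows; the function name and buffer argument are placeholders, while the real code routes this through Tensor::recordPrimaryBufferMemoryBarrier.

    #include <vulkan/vulkan.hpp>

    // Sketch: make a later compute dispatch that reads `buf` wait for an earlier
    // compute dispatch that wrote it (read-after-write between two dispatches).
    void recordComputeToComputeBarrier(vk::CommandBuffer cmd, vk::Buffer buf)
    {
        const vk::BufferMemoryBarrier barrier(
            vk::AccessFlagBits::eShaderWrite,          // producer: shader writes
            vk::AccessFlagBits::eShaderRead,           // consumer: shader reads
            VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED,
            buf, 0, VK_WHOLE_SIZE);
        cmd.pipelineBarrier(
            vk::PipelineStageFlagBits::eComputeShader, // stage that produced the data
            vk::PipelineStageFlagBits::eComputeShader, // stage that will consume it
            vk::DependencyFlags{},
            nullptr, barrier, nullptr);
    }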
FIXES: https://github.com/nomic-ai/gpt4all/issues/1507 --- kompute/src/OpAlgoDispatch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kompute/src/OpAlgoDispatch.cpp b/kompute/src/OpAlgoDispatch.cpp index cad334f0c..dc39cdc3f 100644 --- a/kompute/src/OpAlgoDispatch.cpp +++ b/kompute/src/OpAlgoDispatch.cpp @@ -32,9 +32,9 @@ OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer) this->mAlgorithm->getTensors()) { tensor->recordPrimaryBufferMemoryBarrier( commandBuffer, - vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead, - vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader); } From 1c1701018861810d3db0c746df12a00915d4a6dc Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Mon, 23 Oct 2023 12:22:27 -0400 Subject: [PATCH 52/93] vulkan : fix missing break in matmul selection (#9) --- ggml-vulkan.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index b70b7ac45..4747850cf 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1439,6 +1439,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph ne11, ne12, nb11, nb12, nb1, nb2); + break; case GGML_TYPE_F16: ggml_vk_mul_mat_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, ne0, ne1); break; From 89b71278ff2543658a366fc3259802f1183e8aab Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Fri, 27 Oct 2023 19:04:26 -0400 Subject: [PATCH 53/93] llama : decide to disable Vulkan before loading tensors (#7) --- llama.cpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/llama.cpp b/llama.cpp index 3afbebe2a..cb0a1227a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2407,7 +2407,7 @@ static bool llama_model_load( llama_model & model, int n_ctx, int n_batch, - int n_gpu_layers, + int * n_gpu_layers, int main_gpu, const float * tensor_split, const bool mul_mat_q, @@ -2438,8 +2438,23 @@ static bool llama_model_load( return true; } +#ifdef GGML_USE_KOMPUTE + if (ggml_vk_has_device() && *n_gpu_layers > 0 && ( + !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) + || !( + model.ftype == LLAMA_FTYPE_ALL_F32 || + model.ftype == LLAMA_FTYPE_MOSTLY_F16 || + model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || + model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 + ) + )) { + // disable Vulkan due to unsupported model architecture or quantization type + *n_gpu_layers = 0; + } +#endif + llm_load_tensors( - *ml, model, n_batch, n_gpu_layers, + *ml, model, n_batch, *n_gpu_layers, main_gpu, tensor_split, mul_mat_q, low_vram, memory_type, use_mlock, progress_callback, progress_callback_user_data); } catch (const std::exception & err) { @@ -6354,7 +6369,7 @@ struct llama_model * llama_load_model_from_file( }; } - if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers, + if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, ¶ms.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { @@ -6502,12 +6517,7 @@ struct llama_context * llama_new_context_with_model( #undef LLAMA_METAL_CHECK_BUF } #elif defined(GGML_USE_KOMPUTE) - if (ggml_vk_has_device() && params.n_gpu_layers > 0 - && (model->arch == 
LLM_ARCH_LLAMA || model->arch == LLM_ARCH_FALCON) - && (model->ftype == LLAMA_FTYPE_ALL_F32 - || model->ftype == LLAMA_FTYPE_MOSTLY_F16 - || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_0 - || model->ftype == LLAMA_FTYPE_MOSTLY_Q4_1)) { + if (ggml_vk_has_device() && params.n_gpu_layers > 0) { // this allocates all Vulkan resources and memory buffers ctx->ctx_kompute = ggml_vk_init(); From e006d377dd32cce14ecf2f272305b16b516db906 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Fri, 27 Oct 2023 18:32:29 -0400 Subject: [PATCH 54/93] Scale the workgroup count down to allow correct generation for falcon with AMD radeon cards with lower workgroup count limit Partially fixes #1581 --- ggml-vulkan.cpp | 8 ++++---- kompute/op_addrow.comp | 9 ++++++--- kompute/op_gelu.comp | 4 ++-- kompute/op_mulrow.comp | 7 +++++-- kompute/op_scale.comp | 7 +++++-- kompute/op_silu.comp | 2 +- kompute/src/Algorithm.cpp | 4 ++++ 7 files changed, 27 insertions(+), 14 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4747850cf..239f913f5 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1356,7 +1356,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { if (ggml_nelements(src1) == ne10) { // src1 is a row - ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00); + ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00); } else { ggml_vk_add(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4); } @@ -1365,7 +1365,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph { if (ggml_nelements(src1) == ne10) { // src1 is a row - ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst), ne00); + ggml_vk_mulrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00); } else { ggml_vk_mul(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4); } @@ -1373,7 +1373,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph case GGML_OP_SCALE: { const float scale = *(const float *) src1->data; - ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale); + ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/8, scale); } break; case GGML_OP_UNARY: switch (ggml_get_unary_op(gf->nodes[i])) { @@ -1387,7 +1387,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_UNARY_OP_GELU: { - ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4); + ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/8); } break; default: { diff --git a/kompute/op_addrow.comp b/kompute/op_addrow.comp index 926c929e4..bf674f829 100644 --- a/kompute/op_addrow.comp +++ b/kompute/op_addrow.comp @@ -24,7 +24,10 @@ layout(push_constant) uniform PushConstants { } pcs; void main() { - const uint i = gl_WorkGroupID.x; + const uint baseIndex = gl_WorkGroupID.x * 4; - out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff]; -} \ No newline at end of file + for (uint x = 0; x < 4; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff]; + } +} diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp index f74a14f7e..1412ee1ab 100644 --- a/kompute/op_gelu.comp +++ b/kompute/op_gelu.comp @@ -20,9 +20,9 @@ layout(push_constant) uniform 
PushConstants { } pcs; void main() { - const uint baseIndex = gl_WorkGroupID.x * 4; + const uint baseIndex = gl_WorkGroupID.x * 8; - for (uint x = 0; x < 4; x++) { + for (uint x = 0; x < 8; x++) { const uint i = baseIndex + x; const float y = in_[i + pcs.inOff]; out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y))); diff --git a/kompute/op_mulrow.comp b/kompute/op_mulrow.comp index 498dbdfcd..955fe26bf 100644 --- a/kompute/op_mulrow.comp +++ b/kompute/op_mulrow.comp @@ -24,7 +24,10 @@ layout(push_constant) uniform PushConstants { } pcs; void main() { - const uint i = gl_WorkGroupID.x; + const uint baseIndex = gl_WorkGroupID.x * 4; - out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i % pcs.row) + pcs.inBOff]; + for (uint x = 0; x < 4; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = inA[i + pcs.inAOff] * inB[(i % pcs.row) + pcs.inBOff]; + } } \ No newline at end of file diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp index 8530aaf3e..2ec524435 100644 --- a/kompute/op_scale.comp +++ b/kompute/op_scale.comp @@ -22,7 +22,10 @@ layout(push_constant) uniform PushConstants { } pcs; void main() { - const uint i = gl_WorkGroupID.x; + const uint baseIndex = gl_WorkGroupID.x * 8; - out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; + for (uint x = 0; x < 8; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; + } } \ No newline at end of file diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp index 8c7bfe321..9233fd5a1 100644 --- a/kompute/op_silu.comp +++ b/kompute/op_silu.comp @@ -18,8 +18,8 @@ layout(push_constant) uniform PushConstants { uint inOff; uint outOff; } pcs; -void main() { +void main() { const uint baseIndex = gl_WorkGroupID.x * 4; for (uint x = 0; x < 4; x++) { diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp index ea81fd97b..f8f1c7e36 100644 --- a/kompute/src/Algorithm.cpp +++ b/kompute/src/Algorithm.cpp @@ -387,6 +387,10 @@ Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer) void Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize) { + if (workgroup[0] > 65535) { + fprintf(stderr, "workgroup size is %d\n", workgroup[0]); + fflush(stderr); + } KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size"); From a5eb001eab32554ea73f1027c323473699ea68aa Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Fri, 27 Oct 2023 18:32:51 -0400 Subject: [PATCH 55/93] Revert the prompt processing on gpu for now. 
Fixes issues #1580 and #1581 --- llama.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index cb0a1227a..a196b428f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3486,7 +3486,7 @@ static struct ggml_cgraph * llm_build_falcon( ggml_build_forward_expand(gf, cur); ggml_free(ctx0); - + #if defined(GGML_USE_KOMPUTE) if (lctx.ctx_kompute) { if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) { @@ -3870,11 +3870,19 @@ static bool llama_eval_internal( ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); } #elif defined(GGML_USE_KOMPUTE) - if (lctx.ctx_kompute) { + if (lctx.ctx_kompute && N == 1) { ggml_vk_graph_compute(lctx.ctx_kompute, gf); ggml_vk_d2h_tensor(lctx.ctx_kompute, res); } else { + if (lctx.ctx_kompute) { + ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k); + ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v); + } ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); + if (lctx.ctx_kompute) { + ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k); + ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v); + } } #else ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); From ffd0624be2d9e2c908c1fe9d21feb2a0b2f59ae2 Mon Sep 17 00:00:00 2001 From: Adam Treat Date: Mon, 30 Oct 2023 11:38:21 -0400 Subject: [PATCH 56/93] Remove this debug code. --- kompute/src/Algorithm.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp index f8f1c7e36..0378591bd 100644 --- a/kompute/src/Algorithm.cpp +++ b/kompute/src/Algorithm.cpp @@ -387,11 +387,6 @@ Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer) void Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize) { - if (workgroup[0] > 65535) { - fprintf(stderr, "workgroup size is %d\n", workgroup[0]); - fflush(stderr); - } - KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size"); // The dispatch size is set up based on either explicitly provided template From f88b19888514a1f2d4f3f0b854cb59dda674c081 Mon Sep 17 00:00:00 2001 From: cebtenzzre Date: Wed, 1 Nov 2023 09:46:15 -0400 Subject: [PATCH 57/93] llama : fix Vulkan whitelist (#11) --- llama.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index a196b428f..5fc93bd2e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6352,9 +6352,11 @@ int64_t llama_time_us(void) { return ggml_time_us(); } -struct llama_model * llama_load_model_from_file( - const char * path_model, - struct llama_context_params params) { +static struct llama_model * llama_load_model_from_file_internal( + const char * path_model, struct llama_context_params * params_p +) { + auto & params = *params_p; + ggml_time_init(); llama_model * model = new llama_model; @@ -6389,6 +6391,10 @@ struct llama_model * llama_load_model_from_file( return model; } +struct llama_model * llama_load_model_from_file(const char * path_model, struct llama_context_params params) { + return llama_load_model_from_file_internal(path_model, ¶ms); +} + void llama_free_model(struct llama_model * model) { delete model; } @@ -6559,7 +6565,7 @@ struct llama_context * llama_new_context_with_model( static struct llama_context * llama_init_from_file( const char * path_model, struct llama_context_params params) { - struct llama_model * model = llama_load_model_from_file(path_model, params); + struct llama_model * model = llama_load_model_from_file_internal(path_model, ¶ms); if (!model) { return nullptr; } From a8cac53207ceeeb28a63bb0e141cb75fa6db4028 Mon Sep 17 00:00:00 2001 From: Jared Van 
Bortel Date: Mon, 6 Nov 2023 17:24:14 -0500 Subject: [PATCH 58/93] kompute : fix issues with debug layers --- kompute/src/Manager.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp index 2a3ad2cc9..2d2370f63 100644 --- a/kompute/src/Manager.cpp +++ b/kompute/src/Manager.cpp @@ -180,6 +180,16 @@ Manager::createInstance() applicationExtensions.data(); } + try { + mDynamicLoader = std::make_shared(); + } catch (const std::exception & err) { + return; + } + + PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr = + mDynamicLoader->getProcAddress("vkGetInstanceProcAddr"); + VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); + #ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS KP_LOG_DEBUG("Kompute Manager adding debug validation layers"); // We'll identify the layers that are supported @@ -234,16 +244,6 @@ Manager::createInstance() } #endif - try { - mDynamicLoader = std::make_shared(); - } catch (const std::exception & err) { - return; - } - - PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr = - mDynamicLoader->getProcAddress("vkGetInstanceProcAddr"); - VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); - this->mInstance = std::make_shared(); vk::Result r = vk::createInstance( &computeInstanceCreateInfo, nullptr, this->mInstance.get()); @@ -270,7 +270,7 @@ Manager::createInstance() (PFN_vkDebugReportCallbackEXT)debugMessageCallback; debugCreateInfo.flags = debugFlags; - this->mDebugDispatcher.init(*this->mInstance, &vkGetInstanceProcAddr); + this->mDebugDispatcher.init(*this->mInstance, vkGetInstanceProcAddr); this->mDebugReportCallback = this->mInstance->createDebugReportCallbackEXT( debugCreateInfo, nullptr, this->mDebugDispatcher); From c438c168969fa1c5f9dc362d9bca2fa42444766e Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Mon, 6 Nov 2023 21:08:48 -0500 Subject: [PATCH 59/93] fix build with external fmtlib (v10) Co-authored-by: ToKiNoBug --- kompute/src/Manager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp index 2d2370f63..c5060b1ea 100644 --- a/kompute/src/Manager.cpp +++ b/kompute/src/Manager.cpp @@ -349,7 +349,7 @@ Manager::createDevice(const std::vector& familyQueueIndices, KP_LOG_INFO("Using physical device index {} found {}", physicalDeviceIndex, - physicalDeviceProperties.deviceName); + physicalDeviceProperties.deviceName.data()); if (familyQueueIndices.empty()) { // Find compute queue From 71565eb0c3f2b26b17685ce184bb78a47d89cc15 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 23 Nov 2023 17:18:27 -0500 Subject: [PATCH 60/93] vulkan : replace ggml_diag_mask_inf with ggml_add (custom -inf mask) --- ggml-vulkan.cpp | 59 +++++++++++++++++++++++++++++++++------------ kompute/op_add.comp | 44 +++++++++++++++++++++++++++++---- 2 files changed, 82 insertions(+), 21 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 239f913f5..01d70d1a6 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -579,29 +579,48 @@ uint32_t safe_divide(uint32_t a, uint32_t b) { return a / b; } -void ggml_vk_add(kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - uint32_t size) { +void ggml_vk_add( + kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + int32_t ne00, int32_t ne01, int32_t ne02, 
int32_t ne03, + int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03, + int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, + int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13, + int32_t ne0, + int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3 +) { const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv, kp::shader_data::op_add_comp_spv_len); struct PushConstants { uint32_t inAOff, inBOff, outOff; + int32_t ne00; + int32_t nb00, nb01, nb02, nb03; + int32_t ne10, ne11, ne12, ne13; + int32_t nb10, nb11, nb12, nb13; + int32_t ne0; + int32_t nb0, nb1, nb2, nb3; } const pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4) + safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), + ne00, + nb00, nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb10, nb11, nb12, nb13, + ne0, + nb0, nb1, nb2, nb3 }; std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); - else { + if (!komputeManager()->hasAlgorithm(__func__)) { + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); + } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({size}); + s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setPushConstants({pushConsts}); s_algo->updateDescriptors(s_kompute_context->pool.get()); } @@ -1315,12 +1334,12 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph const int32_t ne10 = src1 ? src1->ne[0] : 0; const int32_t ne11 = src1 ? src1->ne[1] : 0; const int32_t ne12 = src1 ? src1->ne[2] : 0; -// const int32_t ne13 = src1 ? src1->ne[3] : 0; + const int32_t ne13 = src1 ? src1->ne[3] : 0; -// const uint32_t nb10 = src1 ? src1->nb[0] : 0; + const uint32_t nb10 = src1 ? src1->nb[0] : 0; const uint32_t nb11 = src1 ? src1->nb[1] : 0; const uint32_t nb12 = src1 ? src1->nb[2] : 0; -// const uint32_t nb13 = src1 ? src1->nb[3] : 0; + const uint32_t nb13 = src1 ? src1->nb[3] : 0; const int32_t ne0 = dst ? dst->ne[0] : 0; const int32_t ne1 = dst ? 
dst->ne[1] : 0; @@ -1354,11 +1373,19 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_ADD: { - if (ggml_nelements(src1) == ne10) { + if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) { // src1 is a row ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00); } else { - ggml_vk_add(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4); + ggml_vk_add( + seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, + ne00, ne01, ne02, ne03, + nb00, nb01, nb02, nb03, + ne10, ne11, ne12, ne13, + nb10, nb11, nb12, nb13, + ne0, + nb0, nb1, nb2, nb3 + ); } } break; case GGML_OP_MUL: diff --git a/kompute/op_add.comp b/kompute/op_add.comp index 314116aac..df3fdc59c 100644 --- a/kompute/op_add.comp +++ b/kompute/op_add.comp @@ -10,7 +10,7 @@ #include "common.comp" -layout(local_size_x = 1) in; +layout(local_size_x = 1024) in; layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; @@ -20,13 +20,47 @@ layout(push_constant) uniform PushConstants { uint inAOff; uint inBOff; uint outOff; + int ne00; + int nb00; + int nb01; + int nb02; + int nb03; + int ne10; + int ne11; + int ne12; + int ne13; + int nb10; + int nb11; + int nb12; + int nb13; + int ne0; + int nb0; + int nb1; + int nb2; + int nb3; } pcs; +// general-purpose kernel for addition of two tensors +// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3 +// cons: not very efficient void main() { - const uint baseIndex = gl_WorkGroupID.x * 4; + const uint i03 = gl_WorkGroupID.z; + const uint i02 = gl_WorkGroupID.y; + const uint i01 = gl_WorkGroupID.x; - for (uint x = 0; x < 4; x++) { - const uint i = baseIndex + x; - out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[i + pcs.inBOff]; + const uint i13 = i03 % pcs.ne13; + const uint i12 = i02 % pcs.ne12; + const uint i11 = i01 % pcs.ne11; + + uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + gl_SubgroupInvocationID.x*pcs.nb00) / 4); + uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 + gl_SubgroupInvocationID.x*pcs.nb10) / 4); + uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + gl_SubgroupInvocationID.x*pcs.nb0 ) / 4); + + for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) { + out_[pcs.outOff + dst_off] = inA[pcs.inAOff + src0_off] + inB[pcs.inBOff + src1_off]; + + src0_off += gl_WorkGroupSize.x*pcs.ne00; + src1_off += gl_WorkGroupSize.x*pcs.ne10; + dst_off += gl_WorkGroupSize.x*pcs.ne0; } } From 84f7fc4553775c1d1e8401750ce3369ec1ed70ee Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 23 Nov 2023 17:18:42 -0500 Subject: [PATCH 61/93] vulkan : rope n_past is now KQ_pos, f16 rope kernel --- CMakeLists.txt | 6 +- ggml-vulkan.cpp | 82 ++++++++++++-------- kompute/op_rope_f16.comp | 89 ++++++++++++++++++++++ kompute/{op_rope.comp => op_rope_f32.comp} | 23 +++--- llama.cpp | 16 +++- 5 files changed, 169 insertions(+), 47 deletions(-) create mode 100644 kompute/op_rope_f16.comp rename kompute/{op_rope.comp => op_rope_f32.comp} (78%) diff --git a/CMakeLists.txt b/CMakeLists.txt index d26aedaf3..aa453b6b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -490,7 +490,8 @@ if (LLAMA_KOMPUTE) kompute/op_getrows_q4_0.comp kompute/op_getrows_q4_1.comp kompute/op_getrows_q6_k.comp - kompute/op_rope.comp + kompute/op_rope_f16.comp + kompute/op_rope_f32.comp kompute/op_cpy_f16_f16.comp 
kompute/op_cpy_f16_f32.comp kompute/op_cpy_f32_f16.comp @@ -521,7 +522,8 @@ if (LLAMA_KOMPUTE) shaderop_getrows_q4_0.h shaderop_getrows_q4_1.h shaderop_getrows_q6_k.h - shaderop_rope.h + shaderop_rope_f16.h + shaderop_rope_f32.h shaderop_cpy_f16_f16.h shaderop_cpy_f16_f32.h shaderop_cpy_f32_f16.h diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 01d70d1a6..3e3f6cc80 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -32,7 +32,8 @@ #include "shaderop_getrows_q4_0.h" #include "shaderop_getrows_q4_1.h" #include "shaderop_getrows_q6_k.h" -#include "shaderop_rope.h" +#include "shaderop_rope_f16.h" +#include "shaderop_rope_f32.h" #include "shaderop_cpy_f16_f16.h" #include "shaderop_cpy_f16_f32.h" #include "shaderop_cpy_f32_f16.h" @@ -1175,51 +1176,66 @@ void ggml_vk_get_rows_q6_k(Args&&... args) { ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward(args)...); } -void ggml_vk_rope(kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - uint32_t n_past, int32_t n_dims, int32_t mode, - float freq_base, float freq_scale, - int32_t ne01, int32_t ne02, int32_t ne03, - uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, - int32_t ne0, - uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3) { - const static auto spirv = getSpirvShader(kp::shader_data::op_rope_comp_spv, - kp::shader_data::op_rope_comp_spv_len); +void ggml_vk_rope( + kp::Sequence& seq, + const std::shared_ptr& inA, + const std::shared_ptr& inB, + const std::shared_ptr& out, + uint32_t inAOff, uint32_t inBOff, uint32_t outOff, + ggml_type src0t, int32_t n_dims, int32_t mode, + float freq_base, float freq_scale, + int32_t ne01, int32_t ne02, int32_t ne03, + uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, + int32_t ne0, + uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3 +) { + GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32); - GGML_ASSERT(nb03%sizeof(float) == 0); - GGML_ASSERT(nb02%sizeof(float) == 0); - GGML_ASSERT(nb01%sizeof(float) == 0); - GGML_ASSERT(nb00%sizeof(float) == 0); - GGML_ASSERT(nb3%sizeof(float) == 0); - GGML_ASSERT(nb2%sizeof(float) == 0); - GGML_ASSERT(nb1%sizeof(float) == 0); - GGML_ASSERT(nb0%sizeof(float) == 0); + static const auto spirv_f16 = getSpirvShader( + kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len + ); + static const auto spirv_f32 = getSpirvShader( + kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len + ); + + int type_size = src0t == GGML_TYPE_F16 ? 2 : 4; + + GGML_ASSERT(nb03 % type_size == 0); + GGML_ASSERT(nb02 % type_size == 0); + GGML_ASSERT(nb01 % type_size == 0); + GGML_ASSERT(nb00 % type_size == 0); + GGML_ASSERT(nb3 % type_size == 0); + GGML_ASSERT(nb2 % type_size == 0); + GGML_ASSERT(nb1 % type_size == 0); + GGML_ASSERT(nb0 % type_size == 0); struct PushConstants { - uint32_t inOff, outOff; - uint32_t n_past; + uint32_t inAOff, inBOff, outOff; int32_t n_dims, mode; float freq_base, freq_scale; uint32_t nb00, nb01, nb02, nb03; int32_t ne0; uint32_t nb0, nb1, nb2, nb3; } pushConsts { - safe_divide(inOff, 4), safe_divide(outOff, 4), - n_past, n_dims, mode, + safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size), + n_dims, mode, freq_base, freq_scale, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3 }; + auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? 
"_f16" : "_f32"); std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); - else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({in, out}); + if (!komputeManager()->hasAlgorithm(name)) { + s_algo = komputeManager()->algorithm( + name, s_kompute_context->pool.get(), {inA, inB, out}, + src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32, + {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts} + ); + } else { + s_algo = komputeManager()->getAlgorithm(name); + s_algo->setTensors({inA, inB, out}); s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); s_algo->setPushConstants({pushConsts}); s_algo->updateDescriptors(s_kompute_context->pool.get()); @@ -1506,14 +1522,16 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_ROPE: { - const int n_past = ((int32_t *) dst->op_params)[0]; + GGML_ASSERT(ne10 == ne02); + GGML_ASSERT(src0t == dstt); + // const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; float freq_base; float freq_scale; memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - ggml_vk_rope(seq, id_src0, id_dst, off_src0, off_dst, n_past, n_dims, mode, freq_base, freq_scale, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3); + ggml_vk_rope(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, freq_base, freq_scale, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3); } break; case GGML_OP_DUP: case GGML_OP_CPY: diff --git a/kompute/op_rope_f16.comp b/kompute/op_rope_f16.comp new file mode 100644 index 000000000..fd3943c81 --- /dev/null +++ b/kompute/op_rope_f16.comp @@ -0,0 +1,89 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
+ */ + +#version 450 + +#include "common.comp" + +// TODO: use a local size of 32 or more (Metal uses 1024) +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; }; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int n_dims; + int mode; + float freq_base; + float freq_scale; + uint nb00; + uint nb01; + uint nb02; + uint nb03; + int ne0; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +void main() { + const uint i3 = gl_WorkGroupID.z; + const uint i2 = gl_WorkGroupID.y; + const uint i1 = gl_WorkGroupID.x; + + const bool is_neox = (pcs.mode & 2) != 0; + const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); + + const int p = inB[pcs.inBOff + i2]; + + float theta = pcs.freq_scale * float(p); + + if (!is_neox) { + for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) { + const float cos_theta = cos(theta); + const float sin_theta = sin(theta); + + theta *= theta_scale; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + + const float x0 = float(inA[src]); + const float x1 = float(inA[src+1]); + + out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); + out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta); + } + } else { + const float inv_ndims = -1.f/pcs.n_dims; + for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) { + for (uint ic = 0; ic < pcs.n_dims; ic += 2) { + const float cos_theta = cos(theta); + const float sin_theta = sin(theta); + + theta *= theta_scale; + + const uint i0 = ib*pcs.n_dims + ic/2; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + + const float x0 = float(inA[src]); + const float x1 = float(inA[src+pcs.n_dims/2]); + + out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); + out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta); + } + } + } +} diff --git a/kompute/op_rope.comp b/kompute/op_rope_f32.comp similarity index 78% rename from kompute/op_rope.comp rename to kompute/op_rope_f32.comp index 8c2854636..6024c3e5e 100644 --- a/kompute/op_rope.comp +++ b/kompute/op_rope_f32.comp @@ -12,13 +12,14 @@ layout(local_size_x = 1) in; -layout (binding = 0) readonly buffer tensorIn { float in_[]; }; -layout (binding = 1) writeonly buffer tensorOut { float out_[]; }; +layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; +layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; +layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; layout (push_constant) uniform parameter { - uint inOff; + uint inAOff; + uint inBOff; uint outOff; - uint n_past; int n_dims; int mode; float freq_base; @@ -42,7 +43,7 @@ void main() { const bool is_neox = (pcs.mode & 2) != 0; const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - const uint p = ((pcs.mode & 1) == 0 ? 
pcs.n_past + i2 : i2); + const int p = inB[pcs.inBOff + i2]; float theta = pcs.freq_scale * float(p); @@ -53,11 +54,11 @@ void main() { theta *= theta_scale; - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inOff; // Based from in + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - const float x0 = in_[src]; - const float x1 = in_[src+1]; + const float x0 = inA[src]; + const float x1 = inA[src+1]; out_[dst_data] = x0*cos_theta - x1*sin_theta; out_[dst_data+1] = x0*sin_theta + x1*cos_theta; @@ -73,11 +74,11 @@ void main() { const uint i0 = ib*pcs.n_dims + ic/2; - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inOff; // Based from in + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - const float x0 = in_[src]; - const float x1 = in_[src+pcs.n_dims/2]; + const float x0 = inA[src]; + const float x1 = inA[src+pcs.n_dims/2]; out_[dst_data] = x0*cos_theta - x1*sin_theta; out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta; diff --git a/llama.cpp b/llama.cpp index a56ffce9f..8455424b4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2772,8 +2772,9 @@ static struct ggml_cgraph * llm_build_llama( } // shift the entire K-cache if needed + struct ggml_tensor * K_shift = nullptr; if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); offload_func_kq(K_shift); ggml_set_name(K_shift, "K_shift"); ggml_allocr_alloc(lctx.alloc, K_shift); @@ -3024,6 +3025,11 @@ static struct ggml_cgraph * llm_build_llama( ggml_vk_h2d_all(lctx.ctx_kompute); } else { ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor); + ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_pos); + ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_mask); + if (K_shift) { + ggml_vk_h2d_tensor(lctx.ctx_kompute, K_shift); + } } } #endif @@ -3589,8 +3595,9 @@ static struct ggml_cgraph * llm_build_falcon( } // shift the entire K-cache if needed + struct ggml_tensor * K_shift = nullptr; if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); offload_func_kq(K_shift); ggml_set_name(K_shift, "K_shift"); ggml_allocr_alloc(lctx.alloc, K_shift); @@ -3820,6 +3827,11 @@ static struct ggml_cgraph * llm_build_falcon( ggml_vk_h2d_all(lctx.ctx_kompute); } else { ggml_vk_h2d_tensor(lctx.ctx_kompute, toDeviceTensor); + ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_pos); + ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_mask); + if (K_shift) { + ggml_vk_h2d_tensor(lctx.ctx_kompute, K_shift); + } } } #endif From 39abedd1d75b83cc9ff6f5c951d2e4f63d840bdf Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 23 Nov 2023 17:18:48 -0500 Subject: [PATCH 62/93] vulkan : optimize workgroup sizes --- ggml-vulkan.cpp | 4 ++-- kompute/op_cpy_f16_f16.comp | 5 ++--- kompute/op_cpy_f16_f32.comp | 5 ++--- kompute/op_cpy_f32_f16.comp | 5 ++--- kompute/op_cpy_f32_f32.comp | 5 ++--- kompute/op_norm.comp | 18 ++++++++---------- kompute/op_rmsnorm.comp | 12 +++++------- kompute/op_rope_f32.comp | 1 + 8 files changed, 24 insertions(+), 31 deletions(-) diff --git 
a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 3e3f6cc80..74d9fceb6 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -847,9 +847,9 @@ void ggml_vk_norm_(const std::vector& spirv, kp::Sequence& seq, }; std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) + if (!komputeManager()->hasAlgorithm(__func__)) { s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts}); - else { + } else { s_algo = komputeManager()->getAlgorithm(__func__); s_algo->setTensors({in, out}); s_algo->setWorkgroup({(uint32_t)nrows}); diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp index 5f425ae28..652db0313 100644 --- a/kompute/op_cpy_f16_f16.comp +++ b/kompute/op_cpy_f16_f16.comp @@ -10,13 +10,12 @@ #include "common.comp" -#define nth 32 #define IN_TYPE float16_t #define IN_TYPE_SIZE 2 #define OUT_TYPE float16_t #define OUT_TYPE_SIZE 2 -layout(local_size_x = nth) in; +layout(local_size_x = 1024) in; layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; @@ -54,7 +53,7 @@ void main() { const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ out_[dst_data+i00] = OUT_TYPE(in_[src]); } diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp index 4298bebdd..aa204248c 100644 --- a/kompute/op_cpy_f16_f32.comp +++ b/kompute/op_cpy_f16_f32.comp @@ -10,13 +10,12 @@ #include "common.comp" -#define nth 32 #define IN_TYPE float16_t #define IN_TYPE_SIZE 2 #define OUT_TYPE float #define OUT_TYPE_SIZE 4 -layout(local_size_x = nth) in; +layout(local_size_x = 1024) in; layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; @@ -54,7 +53,7 @@ void main() { const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ out_[dst_data+i00] = OUT_TYPE(in_[src]); } diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp index 2d763edfd..4fdab4831 100644 --- a/kompute/op_cpy_f32_f16.comp +++ b/kompute/op_cpy_f32_f16.comp @@ -10,13 +10,12 @@ #include "common.comp" -#define nth 32 #define IN_TYPE float #define IN_TYPE_SIZE 4 #define OUT_TYPE float16_t #define OUT_TYPE_SIZE 2 -layout(local_size_x = nth) in; +layout(local_size_x = 1024) in; layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; @@ -54,7 +53,7 @@ void main() { const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 
+ i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ out_[dst_data+i00] = OUT_TYPE(in_[src]); } diff --git a/kompute/op_cpy_f32_f32.comp b/kompute/op_cpy_f32_f32.comp index 4e5b1d393..2fc998492 100644 --- a/kompute/op_cpy_f32_f32.comp +++ b/kompute/op_cpy_f32_f32.comp @@ -2,13 +2,12 @@ #include "common.comp" -#define nth 32 #define IN_TYPE float #define IN_TYPE_SIZE 4 #define OUT_TYPE float #define OUT_TYPE_SIZE 4 -layout(local_size_x = nth) in; +layout(local_size_x = 1024) in; layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; @@ -46,7 +45,7 @@ void main() { const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ out_[dst_data+i00] = OUT_TYPE(in_[src]); } diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp index 5aafeaac5..1d685cf36 100644 --- a/kompute/op_norm.comp +++ b/kompute/op_norm.comp @@ -10,9 +10,7 @@ #include "common.comp" -#define nth 256 - -layout(local_size_x = nth) in; +layout(local_size_x = 256) in; layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict tensorOut { float out_[]; }; @@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants { float eps; } pcs; -shared float sum[nth]; +shared float sum[gl_WorkGroupSize.x]; void main() { const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_ // MEAN // parallel sum sum[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { sum[gl_LocalInvocationID.x] += in_[x+i00]; } // reduce barrier(); memoryBarrierShared(); - [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { + [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { if (gl_LocalInvocationID.x < i) { sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; } @@ -57,21 +55,21 @@ void main() { // recenter const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_ - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { out_[y+i00] = in_[x+i00] - mean; } // VARIANCE // parallel sum sum[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00]; } // reduce barrier(); memoryBarrierShared(); - [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { + [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { if (gl_LocalInvocationID.x < i) { sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; } @@ -88,7 +86,7 @@ void main() { const float variance = sum[0]; const float scale = 1.0f/sqrt(variance + pcs.eps); - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { out_[y+i00] *= scale; } } diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp index 8d6c0fa6a..5ebaf2269 100644 --- 
a/kompute/op_rmsnorm.comp +++ b/kompute/op_rmsnorm.comp @@ -10,9 +10,7 @@ #include "common.comp" -#define nth 512 - -layout(local_size_x = nth) in; +layout(local_size_x = 512) in; layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; layout(binding = 1) buffer restrict tensorOut { float out_[]; }; @@ -25,21 +23,21 @@ layout(push_constant) uniform PushConstants { float eps; } pcs; -shared float sum[nth]; +shared float sum[gl_WorkGroupSize.x]; void main() { const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_ // parallel sum sum[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00]; } // reduce barrier(); memoryBarrierShared(); - [[unroll]] for (uint i = nth/2; i > 0; i /= 2) { + [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { if (gl_LocalInvocationID.x < i) { sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; } @@ -57,7 +55,7 @@ void main() { const float scale = 1.0f/sqrt(sum[0] + pcs.eps); const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_ - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) { + for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { out_[y+i00] = in_[x+i00] * scale; } } diff --git a/kompute/op_rope_f32.comp b/kompute/op_rope_f32.comp index 6024c3e5e..0cf83fec0 100644 --- a/kompute/op_rope_f32.comp +++ b/kompute/op_rope_f32.comp @@ -10,6 +10,7 @@ #include "common.comp" +// TODO: use a local size of 32 or more (Metal uses 1024) layout(local_size_x = 1) in; layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; From a934b2cb8a1cbe2aad1ca10a119df60bbcf8d5d1 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 14 Nov 2023 11:59:58 -0500 Subject: [PATCH 63/93] vulkan : assert various kernel requirements --- ggml-vulkan.cpp | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 74d9fceb6..d4d6d1b87 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1416,27 +1416,34 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph case GGML_OP_SCALE: { const float scale = *(const float *) src1->data; - ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/8, scale); + int64_t n = ggml_nelements(dst); + GGML_ASSERT(n % 8 == 0); + ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, n/8, scale); } break; case GGML_OP_UNARY: - switch (ggml_get_unary_op(gf->nodes[i])) { - case GGML_UNARY_OP_SILU: - { - ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4); - } break; - case GGML_UNARY_OP_RELU: - { - ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/4); - } break; - case GGML_UNARY_OP_GELU: - { - ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst)/8); - } break; - default: - { - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ASSERT(false); - } + { + int64_t n = ggml_nelements(dst); + GGML_ASSERT(n % 4 == 0); + switch (ggml_get_unary_op(gf->nodes[i])) { + case GGML_UNARY_OP_SILU: + { + ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4); + } break; + case GGML_UNARY_OP_RELU: + { + ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4); + } break; + case 
GGML_UNARY_OP_GELU: + { + GGML_ASSERT(n % 8 == 0); + ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8); + } break; + default: + { + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); + GGML_ASSERT(false); + } + } } break; case GGML_OP_SOFT_MAX: { @@ -1455,6 +1462,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_RMS_NORM: { + GGML_ASSERT(ne00 % 4 == 0); + float eps; memcpy(&eps, dst->op_params, sizeof(float)); ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps); From 6474fc879ac708daa22f7ac80337f9b4a323b387 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 14 Nov 2023 12:10:52 -0500 Subject: [PATCH 64/93] vulkan : handle ggml_scale for n%8 != 0 ref ggerganov/llama.cpp#3754 --- CMakeLists.txt | 2 ++ ggml-vulkan.cpp | 29 ++++++++++++++++++++--------- kompute/op_scale.comp | 10 +++------- kompute/op_scale_8.comp | 31 +++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 16 deletions(-) create mode 100644 kompute/op_scale_8.comp diff --git a/CMakeLists.txt b/CMakeLists.txt index 39dd95eb0..76a03d95f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -476,6 +476,7 @@ if (LLAMA_KOMPUTE) # Compile our shaders compile_shader(SOURCES kompute/op_scale.comp + kompute/op_scale_8.comp kompute/op_add.comp kompute/op_addrow.comp kompute/op_mul.comp @@ -508,6 +509,7 @@ if (LLAMA_KOMPUTE) # Create a custom target for our generated shaders add_custom_target(generated_shaders DEPENDS shaderop_scale.h + shaderop_scale_8.h shaderop_add.h shaderop_addrow.h shaderop_mul.h diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index d4d6d1b87..8c048c77d 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -11,6 +11,7 @@ // These are generated at build time by cmake custom command #include "shaderop_scale.h" +#include "shaderop_scale_8.h" #include "shaderop_add.h" #include "shaderop_addrow.h" #include "shaderop_mul.h" @@ -724,8 +725,12 @@ void ggml_vk_scale(kp::Sequence& seq, const std::shared_ptr& out, uint32_t inOff, uint32_t outOff, uint32_t size, float scale) { - const static auto spirv = getSpirvShader(kp::shader_data::op_scale_comp_spv, - kp::shader_data::op_scale_comp_spv_len); + const static auto spirv_1 = getSpirvShader( + kp::shader_data::op_scale_comp_spv, kp::shader_data::op_scale_comp_spv_len + ); + const static auto spirv_8 = getSpirvShader( + kp::shader_data::op_scale_8_comp_spv, kp::shader_data::op_scale_8_comp_spv_len + ); struct PushConstants { uint32_t inOff, outOff; @@ -735,11 +740,19 @@ void ggml_vk_scale(kp::Sequence& seq, scale }; + const auto * spirv = &spirv_1; + std::string name(__func__); + if (size % 8 == 0) { + size /= 8; + name += "_8"; + spirv = &spirv_8; + } + std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); - else { - s_algo = komputeManager()->getAlgorithm(__func__); + if (!komputeManager()->hasAlgorithm(name)) { + s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts}); + } else { + s_algo = komputeManager()->getAlgorithm(name); s_algo->setTensors({in, out}); s_algo->setWorkgroup({size}); s_algo->setPushConstants({pushConsts}); @@ -1416,9 +1429,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph case GGML_OP_SCALE: { const float scale = *(const float *) 
src1->data; - int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 8 == 0); - ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, n/8, scale); + ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale); } break; case GGML_OP_UNARY: { diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp index 2ec524435..be6806091 100644 --- a/kompute/op_scale.comp +++ b/kompute/op_scale.comp @@ -22,10 +22,6 @@ layout(push_constant) uniform PushConstants { } pcs; void main() { - const uint baseIndex = gl_WorkGroupID.x * 8; - - for (uint x = 0; x < 8; x++) { - const uint i = baseIndex + x; - out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; - } -} \ No newline at end of file + const uint i = gl_WorkGroupID.x; + out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; +} diff --git a/kompute/op_scale_8.comp b/kompute/op_scale_8.comp new file mode 100644 index 000000000..29fa9b35a --- /dev/null +++ b/kompute/op_scale_8.comp @@ -0,0 +1,31 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#version 450 + +#include "common.comp" + +layout(local_size_x = 1) in; + +layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; +layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; + +layout(push_constant) uniform PushConstants { + uint inOff; + uint outOff; + float scale; +} pcs; + +void main() { + const uint baseIndex = gl_WorkGroupID.x * 8; + + for (uint x = 0; x < 8; x++) { + const uint i = baseIndex + x; + out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; + } +} From 9c4dfd06e8172486678a37e66ff5b1a47c8b88f6 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 15 Nov 2023 15:51:55 -0500 Subject: [PATCH 65/93] mention skipped change --- kompute/op_softmax.comp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp index 30b6f0260..a8c2682dc 100644 --- a/kompute/op_softmax.comp +++ b/kompute/op_softmax.comp @@ -6,6 +6,8 @@ * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. */ +// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4) + #version 450 #include "common.comp" From 02c3309f6d3f7892803e8b75e1e6ad77d580a79b Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 14 Nov 2023 15:54:26 -0500 Subject: [PATCH 66/93] merge fixup (e16b9fa4baa8a09c6619b116159830e898050942) --- llama.cpp | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/llama.cpp b/llama.cpp index ed6bd18e1..ca170f596 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3506,6 +3506,10 @@ struct llm_build_context { llama_buffer & buf_compute; +#if defined(GGML_USE_KOMPUTE) + ggml_kompute_context * ctx_kompute; +#endif + struct ggml_context * ctx0 = nullptr; // TODO: consider making the entire interface noexcept @@ -3535,7 +3539,11 @@ struct llm_build_context { kv_head (worst_case ? 
n_ctx - n_tokens : kv_self.head), do_rope_shift (worst_case || kv_self.has_shift), cb (cb), - buf_compute (lctx.buf_compute) { + buf_compute (lctx.buf_compute) +#if defined(GGML_USE_KOMPUTE) + , ctx_kompute (lctx.ctx_kompute) +#endif + { GGML_ASSERT(!!kv_self.ctx); // all initializations should be done in init() @@ -3662,15 +3670,15 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); #if defined(GGML_USE_KOMPUTE) - if (lctx.ctx_kompute) { - if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) { - ggml_vk_h2d_all(lctx.ctx_kompute); + if (ctx_kompute) { + if (!ggml_vk_has_h2d_all(ctx_kompute)) { + ggml_vk_h2d_all(ctx_kompute); } else { - ggml_vk_h2d_tensor(lctx.ctx_kompute, to_device_tensor); - ggml_vk_h2d_tensor(lctx.ctx_kompute, inp_pos); - ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_mask); + ggml_vk_h2d_tensor(ctx_kompute, to_device_tensor); + ggml_vk_h2d_tensor(ctx_kompute, inp_pos); + ggml_vk_h2d_tensor(ctx_kompute, KQ_mask); if (K_shift) { - ggml_vk_h2d_tensor(lctx.ctx_kompute, K_shift); + ggml_vk_h2d_tensor(ctx_kompute, K_shift); } } } @@ -3907,15 +3915,15 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); #if defined(GGML_USE_KOMPUTE) - if (lctx.ctx_kompute) { - if (!ggml_vk_has_h2d_all(lctx.ctx_kompute)) { - ggml_vk_h2d_all(lctx.ctx_kompute); + if (ctx_kompute) { + if (!ggml_vk_has_h2d_all(ctx_kompute)) { + ggml_vk_h2d_all(ctx_kompute); } else { - ggml_vk_h2d_tensor(lctx.ctx_kompute, to_device_tensor); - ggml_vk_h2d_tensor(lctx.ctx_kompute, inp_pos); - ggml_vk_h2d_tensor(lctx.ctx_kompute, KQ_mask); + ggml_vk_h2d_tensor(ctx_kompute, to_device_tensor); + ggml_vk_h2d_tensor(ctx_kompute, inp_pos); + ggml_vk_h2d_tensor(ctx_kompute, KQ_mask); if (K_shift) { - ggml_vk_h2d_tensor(lctx.ctx_kompute, K_shift); + ggml_vk_h2d_tensor(ctx_kompute, K_shift); } } } From 208cd52f7d2ca3eb9708cfd457dde0592ed0e38b Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 15 Nov 2023 17:58:19 -0500 Subject: [PATCH 67/93] vulkan : implement YaRN RoPE scaling (#2268) The NeoX cur_rot part is different because I'm pretty sure my original implementation was wrong. 
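The new YaRN parameters ride in the ROPE op's op_params blob and are forwarded to the shaders as push constants. A small host-side sketch of the unpacking follows; the struct and function names are illustrative, while the indices and fields mirror the ggml-vulkan.cpp hunk below (slot 0 is the legacy n_past value and slot 3 is n_ctx, used only by GLM RoPE and skipped here).

    #include <cstdint>
    #include <cstring>

    struct rope_params {
        int32_t n_dims, mode, n_orig_ctx;
        float   freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
    };

    // op_params is the int32 array ggml stores on the ROPE dst tensor.
    rope_params unpack_rope_params(const int32_t * op_params)
    {
        rope_params p;
        p.n_dims     = op_params[1];
        p.mode       = op_params[2];
        p.n_orig_ctx = op_params[4];
        std::memcpy(&p.freq_base,   op_params + 5,  sizeof(float));
        std::memcpy(&p.freq_scale,  op_params + 6,  sizeof(float));
        std::memcpy(&p.ext_factor,  op_params + 7,  sizeof(float));
        std::memcpy(&p.attn_factor, op_params + 8,  sizeof(float));
        std::memcpy(&p.beta_fast,   op_params + 9,  sizeof(float));
        std::memcpy(&p.beta_slow,   op_params + 10, sizeof(float));
        return p;
    }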
--- ggml-vulkan.cpp | 36 ++++++++++++------- kompute/common.comp | 1 + kompute/op_rope_f16.comp | 40 +++++++-------------- kompute/op_rope_f32.comp | 40 +++++++-------------- kompute/rope_common.comp | 75 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 123 insertions(+), 69 deletions(-) create mode 100644 kompute/rope_common.comp diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 8c048c77d..a4f9ade0e 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1195,8 +1195,8 @@ void ggml_vk_rope( const std::shared_ptr& inB, const std::shared_ptr& out, uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - ggml_type src0t, int32_t n_dims, int32_t mode, - float freq_base, float freq_scale, + ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_orig_ctx, + float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow, int32_t ne01, int32_t ne02, int32_t ne03, uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, int32_t ne0, @@ -1224,15 +1224,15 @@ void ggml_vk_rope( struct PushConstants { uint32_t inAOff, inBOff, outOff; - int32_t n_dims, mode; - float freq_base, freq_scale; + int32_t n_dims, mode, n_orig_ctx; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; uint32_t nb00, nb01, nb02, nb03; int32_t ne0; uint32_t nb0, nb1, nb2, nb3; } pushConsts { safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size), - n_dims, mode, - freq_base, freq_scale, + n_dims, mode, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3 @@ -1545,13 +1545,23 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph GGML_ASSERT(ne10 == ne02); GGML_ASSERT(src0t == dstt); // const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - float freq_base; - float freq_scale; - memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); - ggml_vk_rope(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, freq_base, freq_scale, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3); + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + ggml_vk_rope( + seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, + ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3 + ); } break; case GGML_OP_DUP: case GGML_OP_CPY: diff --git a/kompute/common.comp b/kompute/common.comp index 040b87375..fe0bc5d15 100644 --- a/kompute/common.comp +++ b/kompute/common.comp @@ -20,6 +20,7 @@ #define GELU_COEF_A 0.044715 #define SQRT_2_OVER_PI 
0.79788456080286535587989211986876 +#define TWOPI_F 6.283185307179586f #define QK_K 256 diff --git a/kompute/op_rope_f16.comp b/kompute/op_rope_f16.comp index fd3943c81..e4b5ccca3 100644 --- a/kompute/op_rope_f16.comp +++ b/kompute/op_rope_f16.comp @@ -8,50 +8,32 @@ #version 450 -#include "common.comp" - -// TODO: use a local size of 32 or more (Metal uses 1024) -layout(local_size_x = 1) in; +#include "rope_common.comp" layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; }; layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; }; -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int n_dims; - int mode; - float freq_base; - float freq_scale; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - void main() { const uint i3 = gl_WorkGroupID.z; const uint i2 = gl_WorkGroupID.y; const uint i1 = gl_WorkGroupID.x; const bool is_neox = (pcs.mode & 2) != 0; + + float corr_dims[2]; + rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); + const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); const int p = inB[pcs.inBOff + i2]; - float theta = pcs.freq_scale * float(p); + float theta = float(p); if (!is_neox) { for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) { - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + float cos_theta, sin_theta; + rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); theta *= theta_scale; @@ -68,8 +50,10 @@ void main() { const float inv_ndims = -1.f/pcs.n_dims; for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) { for (uint ic = 0; ic < pcs.n_dims; ic += 2) { - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + const uint cur_rot = ib * pcs.n_dims + ic; + + float cos_theta, sin_theta; + rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); theta *= theta_scale; diff --git a/kompute/op_rope_f32.comp b/kompute/op_rope_f32.comp index 0cf83fec0..0a882879d 100644 --- a/kompute/op_rope_f32.comp +++ b/kompute/op_rope_f32.comp @@ -8,50 +8,32 @@ #version 450 -#include "common.comp" - -// TODO: use a local size of 32 or more (Metal uses 1024) -layout(local_size_x = 1) in; +#include "rope_common.comp" layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int n_dims; - int mode; - float freq_base; - float freq_scale; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - void main() { const uint i3 = gl_WorkGroupID.z; const uint i2 = gl_WorkGroupID.y; const uint i1 = gl_WorkGroupID.x; const bool is_neox = (pcs.mode & 2) != 0; + + float corr_dims[2]; + rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); + const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); const int p = inB[pcs.inBOff + i2]; - float theta = pcs.freq_scale * float(p); + float theta = float(p); if (!is_neox) { for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) { - const float cos_theta = cos(theta); - const float sin_theta = 
sin(theta); + float cos_theta, sin_theta; + rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); theta *= theta_scale; @@ -68,8 +50,10 @@ void main() { const float inv_ndims = -1.f/pcs.n_dims; for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) { for (uint ic = 0; ic < pcs.n_dims; ic += 2) { - const float cos_theta = cos(theta); - const float sin_theta = sin(theta); + const uint cur_rot = ib * pcs.n_dims + ic; + + float cos_theta, sin_theta; + rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); theta *= theta_scale; diff --git a/kompute/rope_common.comp b/kompute/rope_common.comp new file mode 100644 index 000000000..45682dc28 --- /dev/null +++ b/kompute/rope_common.comp @@ -0,0 +1,75 @@ +/** + * Copyright (c) 2023 Nomic, Inc. All rights reserved. + * + * This software is licensed under the terms of the Software for Open Models License (SOM), + * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany + * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. + */ + +#include "common.comp" + +// TODO: use a local size of 32 or more (Metal uses 1024) +layout(local_size_x = 1) in; + +layout (push_constant) uniform parameter { + uint inAOff; + uint inBOff; + uint outOff; + int n_dims; + int mode; + int n_orig_ctx; + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + uint nb00; + uint nb01; + uint nb02; + uint nb03; + int ne0; + uint nb0; + uint nb1; + uint nb2; + uint nb3; +} pcs; + +float rope_yarn_ramp(const float low, const float high, const float i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. 
+void rope_yarn( + float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale, + out float cos_theta, out float sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); + } + cos_theta = cos(theta) * mscale; + sin_theta = sin(theta) * mscale; +} + +// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get +// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` +float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { + return n_dims * log(n_orig_ctx / (n_rot * TWOPI_F)) / (2 * log(base)); +} + +void rope_yarn_corr_dims( + int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2] +) { + // start and end correction dims + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); +} From a4bb9c5ced174b306958fb79f11c3b5bfafcf5ea Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 23 Nov 2023 12:20:07 -0500 Subject: [PATCH 68/93] vulkan : sync with "migrate to dynamic graphs" --- ggml-vulkan.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index a4f9ade0e..a3308191c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1350,6 +1350,15 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph struct ggml_tensor * dst = gf->nodes[i]; GGML_ASSERT(dst->data != nullptr); + switch (dst->op) { + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + continue; // noop -> next node + } + const int32_t ne00 = src0 ? src0->ne[0] : 0; const int32_t ne01 = src0 ? src0->ne[1] : 0; const int32_t ne02 = src0 ? src0->ne[2] : 0; @@ -1393,13 +1402,6 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph const std::shared_ptr& id_dst = dst ? 
ggml_vk_get_tensor(ctx, dst, &off_dst) : nullTensor; switch (dst->op) { - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - { - // noop - } break; case GGML_OP_ADD: { if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) { From 56430c3209bebbc6547cd13db32c83cc32b5f4ce Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 13 Dec 2023 16:54:06 -0500 Subject: [PATCH 69/93] relicense Vulkan backend as MIT --- LICENSE_SOM.txt | 30 ----------------------- ggml-vulkan.cpp | 8 ------ ggml-vulkan.h | 8 ------ kompute/common.comp | 8 ------ kompute/op_add.comp | 8 ------ kompute/op_addrow.comp | 8 ------ kompute/op_cpy_f16_f16.comp | 8 ------ kompute/op_cpy_f16_f32.comp | 8 ------ kompute/op_cpy_f32_f16.comp | 8 ------ kompute/op_diagmask.comp | 8 ------ kompute/op_gelu.comp | 8 ------ kompute/op_getrows.comp | 8 ------ kompute/op_getrows_f16.comp | 8 ------ kompute/op_getrows_q4_0.comp | 8 ------ kompute/op_getrows_q4_1.comp | 8 ------ kompute/op_getrows_q6_k.comp | 8 ------ kompute/op_mul.comp | 8 ------ kompute/op_mul_mat_f16.comp | 8 ------ kompute/op_mul_mat_mat_f32.comp | 9 ------- kompute/op_mul_mat_q4_0.comp | 8 ------ kompute/op_mul_mat_q4_1.comp | 8 ------ kompute/op_mul_mat_q6_k.comp | 8 ------ kompute/op_mul_mat_q8_0.comp | 8 ------ kompute/op_mul_mv_q_n.comp | 8 ------ kompute/op_mulrow.comp | 8 ------ kompute/op_norm.comp | 8 ------ kompute/op_relu.comp | 8 ------ kompute/op_rmsnorm.comp | 8 ------ kompute/op_rope_f16.comp | 8 ------ kompute/op_rope_f32.comp | 8 ------ kompute/op_scale.comp | 8 ------ kompute/op_scale_8.comp | 8 ------ kompute/op_silu.comp | 8 ------ kompute/op_softmax.comp | 8 ------ kompute/rope_common.comp | 8 ------ kompute/src/Algorithm.cpp | 9 ------- kompute/src/Core.cpp | 8 ------ kompute/src/Manager.cpp | 8 ------ kompute/src/OpAlgoDispatch.cpp | 8 ------ kompute/src/OpBufferSyncDevice.cpp | 8 ------ kompute/src/OpBufferSyncLocal.cpp | 8 ------ kompute/src/OpMemoryBarrier.cpp | 8 ------ kompute/src/OpTensorCopy.cpp | 8 ------ kompute/src/OpTensorFill.cpp | 8 ------ kompute/src/OpTensorSyncDevice.cpp | 8 ------ kompute/src/OpTensorSyncLocal.cpp | 8 ------ kompute/src/Sequence.cpp | 8 ------ kompute/src/Tensor.cpp | 8 ------ kompute/src/include/kompute/Algorithm.hpp | 9 ------- kompute/src/include/kompute/Core.hpp | 9 ------- kompute/src/include/kompute/Manager.hpp | 9 ------- kompute/src/include/kompute/Sequence.hpp | 9 ------- kompute/src/include/kompute/Tensor.hpp | 8 ------ 53 files changed, 452 deletions(-) delete mode 100644 LICENSE_SOM.txt diff --git a/LICENSE_SOM.txt b/LICENSE_SOM.txt deleted file mode 100644 index eb912c0fd..000000000 --- a/LICENSE_SOM.txt +++ /dev/null @@ -1,30 +0,0 @@ -Software for Open Models License (SOM) -Version 1.0 dated August 30th, 2023 - -This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software. - -This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent. - -1. Definitions -The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license. -A “Model” is the output of a machine learning algorithm, and excludes the Software. 
-“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model. -“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law. - -2. Grant of Rights. Subject to the conditions and limitations in section 3: -(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software. - -(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software. No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor. - -3. Conditions and Limitations -(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms. - -(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms. - -(C) No Trademark License. This license does not grant you rights to use the Licensor’s name, logo, or trademarks. - -(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim. - -(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license. - -(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability. diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index a3308191c..1abf1e699 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. 
A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "ggml-vulkan.h" #include "ggml.h" diff --git a/ggml-vulkan.h b/ggml-vulkan.h index 7989cfc1f..ac8a4d4a0 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #pragma once #include diff --git a/kompute/common.comp b/kompute/common.comp index fe0bc5d15..0df6db7d0 100644 --- a/kompute/common.comp +++ b/kompute/common.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #extension GL_EXT_shader_16bit_storage: require #extension GL_EXT_shader_8bit_storage: require #extension GL_EXT_shader_explicit_arithmetic_types_float16: require diff --git a/kompute/op_add.comp b/kompute/op_add.comp index df3fdc59c..c86673452 100644 --- a/kompute/op_add.comp +++ b/kompute/op_add.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_addrow.comp b/kompute/op_addrow.comp index bf674f829..2376a6b8f 100644 --- a/kompute/op_addrow.comp +++ b/kompute/op_addrow.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_cpy_f16_f16.comp b/kompute/op_cpy_f16_f16.comp index 652db0313..d57247d2d 100644 --- a/kompute/op_cpy_f16_f16.comp +++ b/kompute/op_cpy_f16_f16.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_cpy_f16_f32.comp b/kompute/op_cpy_f16_f32.comp index aa204248c..b568bcd7b 100644 --- a/kompute/op_cpy_f16_f32.comp +++ b/kompute/op_cpy_f16_f32.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. 
A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_cpy_f32_f16.comp b/kompute/op_cpy_f32_f16.comp index 4fdab4831..99b228343 100644 --- a/kompute/op_cpy_f32_f16.comp +++ b/kompute/op_cpy_f32_f16.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_diagmask.comp b/kompute/op_diagmask.comp index 8dc2cc60a..291c3fc18 100644 --- a/kompute/op_diagmask.comp +++ b/kompute/op_diagmask.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_gelu.comp b/kompute/op_gelu.comp index 1412ee1ab..5b547f414 100644 --- a/kompute/op_gelu.comp +++ b/kompute/op_gelu.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_getrows.comp b/kompute/op_getrows.comp index a4d8bb9a0..1a5581b23 100644 --- a/kompute/op_getrows.comp +++ b/kompute/op_getrows.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - void main() { const uint i = gl_WorkGroupID.x; const int r = inB[i + pcs.inBOff]; diff --git a/kompute/op_getrows_f16.comp b/kompute/op_getrows_f16.comp index 3f2b16724..34acbcd70 100644 --- a/kompute/op_getrows_f16.comp +++ b/kompute/op_getrows_f16.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_getrows_q4_0.comp b/kompute/op_getrows_q4_0.comp index 0449b1987..32b2e891e 100644 --- a/kompute/op_getrows_q4_0.comp +++ b/kompute/op_getrows_q4_0.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. 
- * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_getrows_q4_1.comp b/kompute/op_getrows_q4_1.comp index 64586cdc9..87f2fbe17 100644 --- a/kompute/op_getrows_q4_1.comp +++ b/kompute/op_getrows_q4_1.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_getrows_q6_k.comp b/kompute/op_getrows_q6_k.comp index 95817b487..9ce3545d1 100644 --- a/kompute/op_getrows_q6_k.comp +++ b/kompute/op_getrows_q6_k.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_mul.comp b/kompute/op_mul.comp index 662ea8177..d599460c3 100644 --- a/kompute/op_mul.comp +++ b/kompute/op_mul.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_mul_mat_f16.comp b/kompute/op_mul_mat_f16.comp index b56d14f77..dd1e13979 100644 --- a/kompute/op_mul_mat_f16.comp +++ b/kompute/op_mul_mat_f16.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_mul_mat_mat_f32.comp b/kompute/op_mul_mat_mat_f32.comp index a2dba0560..6cc5558b2 100644 --- a/kompute/op_mul_mat_mat_f32.comp +++ b/kompute/op_mul_mat_mat_f32.comp @@ -1,12 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models - * License (SOM), version 1.0, as detailed in the LICENSE_SOM.txt file. A copy - * of this license should accompany this software. Except as expressly granted - * in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute/op_mul_mat_q4_0.comp index 165df3c37..03788c920 100644 --- a/kompute/op_mul_mat_q4_0.comp +++ b/kompute/op_mul_mat_q4_0.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. 
All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute/op_mul_mat_q4_1.comp index 683b695ca..0ae8f8c7d 100644 --- a/kompute/op_mul_mat_q4_1.comp +++ b/kompute/op_mul_mat_q4_1.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute/op_mul_mat_q6_k.comp index 6148053b2..c9baebdf4 100644 --- a/kompute/op_mul_mat_q6_k.comp +++ b/kompute/op_mul_mat_q6_k.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_mul_mat_q8_0.comp b/kompute/op_mul_mat_q8_0.comp index 2ba48127b..1c4ddbb08 100644 --- a/kompute/op_mul_mat_q8_0.comp +++ b/kompute/op_mul_mat_q8_0.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_mul_mv_q_n.comp b/kompute/op_mul_mv_q_n.comp index a9b64fe16..8b6e6a2e2 100644 --- a/kompute/op_mul_mv_q_n.comp +++ b/kompute/op_mul_mv_q_n.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - void main() { if (gl_SubgroupInvocationID > 31) return; diff --git a/kompute/op_mulrow.comp b/kompute/op_mulrow.comp index 955fe26bf..ae7106320 100644 --- a/kompute/op_mulrow.comp +++ b/kompute/op_mulrow.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_norm.comp b/kompute/op_norm.comp index 1d685cf36..ad0c3c01b 100644 --- a/kompute/op_norm.comp +++ b/kompute/op_norm.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. 
All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_relu.comp b/kompute/op_relu.comp index c6ed044a3..52a601fe6 100644 --- a/kompute/op_relu.comp +++ b/kompute/op_relu.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_rmsnorm.comp b/kompute/op_rmsnorm.comp index 5ebaf2269..da658c160 100644 --- a/kompute/op_rmsnorm.comp +++ b/kompute/op_rmsnorm.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_rope_f16.comp b/kompute/op_rope_f16.comp index e4b5ccca3..3abe3ed33 100644 --- a/kompute/op_rope_f16.comp +++ b/kompute/op_rope_f16.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "rope_common.comp" diff --git a/kompute/op_rope_f32.comp b/kompute/op_rope_f32.comp index 0a882879d..104ae0ba4 100644 --- a/kompute/op_rope_f32.comp +++ b/kompute/op_rope_f32.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "rope_common.comp" diff --git a/kompute/op_scale.comp b/kompute/op_scale.comp index be6806091..bdae26738 100644 --- a/kompute/op_scale.comp +++ b/kompute/op_scale.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_scale_8.comp b/kompute/op_scale_8.comp index 29fa9b35a..ada69754b 100644 --- a/kompute/op_scale_8.comp +++ b/kompute/op_scale_8.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. 
- * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_silu.comp b/kompute/op_silu.comp index 9233fd5a1..0fb8e4b74 100644 --- a/kompute/op_silu.comp +++ b/kompute/op_silu.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #version 450 #include "common.comp" diff --git a/kompute/op_softmax.comp b/kompute/op_softmax.comp index a8c2682dc..89de1b701 100644 --- a/kompute/op_softmax.comp +++ b/kompute/op_softmax.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - // TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4) #version 450 diff --git a/kompute/rope_common.comp b/kompute/rope_common.comp index 45682dc28..57ba6597a 100644 --- a/kompute/rope_common.comp +++ b/kompute/rope_common.comp @@ -1,11 +1,3 @@ -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "common.comp" // TODO: use a local size of 32 or more (Metal uses 1024) diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp index 0378591bd..c2d8554e1 100644 --- a/kompute/src/Algorithm.cpp +++ b/kompute/src/Algorithm.cpp @@ -1,13 +1,4 @@ // SPDX-License-Identifier: Apache-2.0 - -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include #include "kompute/Algorithm.hpp" diff --git a/kompute/src/Core.cpp b/kompute/src/Core.cpp index 9b0483232..020f44160 100644 --- a/kompute/src/Core.cpp +++ b/kompute/src/Core.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
- */ - #include "kompute/Core.hpp" #ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp index c5060b1ea..0c588e19b 100644 --- a/kompute/src/Manager.cpp +++ b/kompute/src/Manager.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "kompute/Manager.hpp" #include "fmt/format.h" #include "kompute/logger/Logger.hpp" diff --git a/kompute/src/OpAlgoDispatch.cpp b/kompute/src/OpAlgoDispatch.cpp index dc39cdc3f..edc0f6eb6 100644 --- a/kompute/src/OpAlgoDispatch.cpp +++ b/kompute/src/OpAlgoDispatch.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "kompute/operations/OpAlgoDispatch.hpp" namespace kp { diff --git a/kompute/src/OpBufferSyncDevice.cpp b/kompute/src/OpBufferSyncDevice.cpp index baaafda0f..1812d04b2 100644 --- a/kompute/src/OpBufferSyncDevice.cpp +++ b/kompute/src/OpBufferSyncDevice.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "kompute/operations/OpBufferSyncDevice.hpp" namespace kp { diff --git a/kompute/src/OpBufferSyncLocal.cpp b/kompute/src/OpBufferSyncLocal.cpp index 63739a351..a829819fa 100644 --- a/kompute/src/OpBufferSyncLocal.cpp +++ b/kompute/src/OpBufferSyncLocal.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "kompute/operations/OpBufferSyncLocal.hpp" namespace kp { diff --git a/kompute/src/OpMemoryBarrier.cpp b/kompute/src/OpMemoryBarrier.cpp index 89d44d85e..1f075a3c4 100644 --- a/kompute/src/OpMemoryBarrier.cpp +++ b/kompute/src/OpMemoryBarrier.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
- */ - #include "kompute/operations/OpMemoryBarrier.hpp" namespace kp { diff --git a/kompute/src/OpTensorCopy.cpp b/kompute/src/OpTensorCopy.cpp index e732cc413..1eaf428b8 100644 --- a/kompute/src/OpTensorCopy.cpp +++ b/kompute/src/OpTensorCopy.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "kompute/operations/OpTensorCopy.hpp" #include "kompute/Tensor.hpp" diff --git a/kompute/src/OpTensorFill.cpp b/kompute/src/OpTensorFill.cpp index da477dcc7..bda7d6040 100644 --- a/kompute/src/OpTensorFill.cpp +++ b/kompute/src/OpTensorFill.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "kompute/operations/OpTensorFill.hpp" #include "kompute/Tensor.hpp" diff --git a/kompute/src/OpTensorSyncDevice.cpp b/kompute/src/OpTensorSyncDevice.cpp index 4cc6abf71..b563529ea 100644 --- a/kompute/src/OpTensorSyncDevice.cpp +++ b/kompute/src/OpTensorSyncDevice.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "kompute/operations/OpTensorSyncDevice.hpp" namespace kp { diff --git a/kompute/src/OpTensorSyncLocal.cpp b/kompute/src/OpTensorSyncLocal.cpp index 1aa091b73..7818db565 100644 --- a/kompute/src/OpTensorSyncLocal.cpp +++ b/kompute/src/OpTensorSyncLocal.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "kompute/Tensor.hpp" #include "kompute/operations/OpTensorSyncLocal.hpp" diff --git a/kompute/src/Sequence.cpp b/kompute/src/Sequence.cpp index 3b5fb5fb5..da3b379a3 100644 --- a/kompute/src/Sequence.cpp +++ b/kompute/src/Sequence.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
- */ - #include "kompute/Sequence.hpp" namespace kp { diff --git a/kompute/src/Tensor.cpp b/kompute/src/Tensor.cpp index 65279206d..84dce08e0 100644 --- a/kompute/src/Tensor.cpp +++ b/kompute/src/Tensor.cpp @@ -1,13 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #include "kompute/Tensor.hpp" namespace kp { diff --git a/kompute/src/include/kompute/Algorithm.hpp b/kompute/src/include/kompute/Algorithm.hpp index ef11234ee..e5fef1f56 100644 --- a/kompute/src/include/kompute/Algorithm.hpp +++ b/kompute/src/include/kompute/Algorithm.hpp @@ -1,13 +1,4 @@ // SPDX-License-Identifier: Apache-2.0 - -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #pragma once #include "kompute/Core.hpp" diff --git a/kompute/src/include/kompute/Core.hpp b/kompute/src/include/kompute/Core.hpp index 99222cbde..406e6b5d4 100644 --- a/kompute/src/include/kompute/Core.hpp +++ b/kompute/src/include/kompute/Core.hpp @@ -1,13 +1,4 @@ // SPDX-License-Identifier: Apache-2.0 - -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #pragma once #include diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp index e910b2b81..780c352eb 100644 --- a/kompute/src/include/kompute/Manager.hpp +++ b/kompute/src/include/kompute/Manager.hpp @@ -1,13 +1,4 @@ // SPDX-License-Identifier: Apache-2.0 - -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ - #pragma once #include diff --git a/kompute/src/include/kompute/Sequence.hpp b/kompute/src/include/kompute/Sequence.hpp index e282242f1..3b29a6e2e 100644 --- a/kompute/src/include/kompute/Sequence.hpp +++ b/kompute/src/include/kompute/Sequence.hpp @@ -1,13 +1,4 @@ // SPDX-License-Identifier: Apache-2.0 - -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. 
- */ - #pragma once #include "kompute/Core.hpp" diff --git a/kompute/src/include/kompute/Tensor.hpp b/kompute/src/include/kompute/Tensor.hpp index 2ab88eb30..20939093d 100644 --- a/kompute/src/include/kompute/Tensor.hpp +++ b/kompute/src/include/kompute/Tensor.hpp @@ -1,12 +1,4 @@ // SPDX-License-Identifier: Apache-2.0 - -/** - * Copyright (c) 2023 Nomic, Inc. All rights reserved. - * - * This software is licensed under the terms of the Software for Open Models License (SOM), - * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany - * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc. - */ #pragma once #include "kompute/Core.hpp" From 3e09e127ebba12d175d180d65e6c1da165e8424f Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 13 Dec 2023 17:10:32 -0500 Subject: [PATCH 70/93] rename ggml-vulkan -> ggml-kompute --- CMakeLists.txt | 10 +++++----- examples/main/main.cpp | 2 +- ggml-vulkan.cpp => ggml-kompute.cpp | 2 +- ggml-vulkan.h => ggml-kompute.h | 0 llama.cpp | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) rename ggml-vulkan.cpp => ggml-kompute.cpp (99%) rename ggml-vulkan.h => ggml-kompute.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 76f489691..0e9183625 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -510,15 +510,15 @@ if (LLAMA_KOMPUTE) # Create a custom command that depends on the generated_shaders add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp - COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp + COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp DEPENDS generated_shaders - COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp" + COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" ) # Add the stamp to the main sources to ensure dependency tracking - set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp) - set(GGML_HEADERS_KOMPUTE ggml-vulkan.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp) + set(GGML_SOURCES_KOMPUTE ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) + set(GGML_HEADERS_KOMPUTE ggml-kompute.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) add_compile_definitions(GGML_USE_KOMPUTE) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute) set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR}) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index afcb566c4..31cc07434 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -32,7 +32,7 @@ #endif #if defined(GGML_USE_KOMPUTE) -#include "ggml-vulkan.h" +#include "ggml-kompute.h" #endif static llama_context ** g_ctx; diff --git a/ggml-vulkan.cpp b/ggml-kompute.cpp similarity index 99% rename from ggml-vulkan.cpp rename to ggml-kompute.cpp index 1abf1e699..df8bcca3d 100644 --- a/ggml-vulkan.cpp +++ b/ggml-kompute.cpp @@ -1,4 +1,4 @@ -#include "ggml-vulkan.h" +#include "ggml-kompute.h" #include "ggml.h" // These are generated at build time by cmake custom command diff --git a/ggml-vulkan.h b/ggml-kompute.h similarity index 100% rename from ggml-vulkan.h rename to ggml-kompute.h diff --git a/llama.cpp b/llama.cpp index f7991b275..97a688f4b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12,7 +12,7 @@ #elif defined(GGML_USE_CLBLAST) # include "ggml-opencl.h" #elif defined(GGML_USE_KOMPUTE) -# include "ggml-vulkan.h" +# include "ggml-kompute.h" #endif 
#ifdef GGML_USE_METAL From 27631dbb6eabfc24a6ec4406967145e46d345542 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 13 Dec 2023 17:22:19 -0500 Subject: [PATCH 71/93] separate shaders from kompute itself --- CMakeLists.txt | 69 ++++++++++--------- {kompute => kompute-shaders}/common.comp | 0 {kompute => kompute-shaders}/op_add.comp | 0 {kompute => kompute-shaders}/op_addrow.comp | 0 .../op_cpy_f16_f16.comp | 0 .../op_cpy_f16_f32.comp | 0 .../op_cpy_f32_f16.comp | 0 .../op_cpy_f32_f32.comp | 0 {kompute => kompute-shaders}/op_diagmask.comp | 0 {kompute => kompute-shaders}/op_gelu.comp | 0 {kompute => kompute-shaders}/op_getrows.comp | 0 .../op_getrows_f16.comp | 0 .../op_getrows_q4_0.comp | 0 .../op_getrows_q4_1.comp | 0 .../op_getrows_q6_k.comp | 0 {kompute => kompute-shaders}/op_mul.comp | 0 .../op_mul_mat_f16.comp | 0 .../op_mul_mat_mat_f32.comp | 0 .../op_mul_mat_q4_0.comp | 0 .../op_mul_mat_q4_1.comp | 0 .../op_mul_mat_q6_k.comp | 0 .../op_mul_mat_q8_0.comp | 0 .../op_mul_mv_q_n.comp | 0 {kompute => kompute-shaders}/op_mulrow.comp | 0 {kompute => kompute-shaders}/op_norm.comp | 0 {kompute => kompute-shaders}/op_relu.comp | 0 {kompute => kompute-shaders}/op_rmsnorm.comp | 0 {kompute => kompute-shaders}/op_rope_f16.comp | 0 {kompute => kompute-shaders}/op_rope_f32.comp | 0 {kompute => kompute-shaders}/op_scale.comp | 0 {kompute => kompute-shaders}/op_scale_8.comp | 0 {kompute => kompute-shaders}/op_silu.comp | 0 {kompute => kompute-shaders}/op_softmax.comp | 0 {kompute => kompute-shaders}/rope_common.comp | 0 34 files changed, 35 insertions(+), 34 deletions(-) rename {kompute => kompute-shaders}/common.comp (100%) rename {kompute => kompute-shaders}/op_add.comp (100%) rename {kompute => kompute-shaders}/op_addrow.comp (100%) rename {kompute => kompute-shaders}/op_cpy_f16_f16.comp (100%) rename {kompute => kompute-shaders}/op_cpy_f16_f32.comp (100%) rename {kompute => kompute-shaders}/op_cpy_f32_f16.comp (100%) rename {kompute => kompute-shaders}/op_cpy_f32_f32.comp (100%) rename {kompute => kompute-shaders}/op_diagmask.comp (100%) rename {kompute => kompute-shaders}/op_gelu.comp (100%) rename {kompute => kompute-shaders}/op_getrows.comp (100%) rename {kompute => kompute-shaders}/op_getrows_f16.comp (100%) rename {kompute => kompute-shaders}/op_getrows_q4_0.comp (100%) rename {kompute => kompute-shaders}/op_getrows_q4_1.comp (100%) rename {kompute => kompute-shaders}/op_getrows_q6_k.comp (100%) rename {kompute => kompute-shaders}/op_mul.comp (100%) rename {kompute => kompute-shaders}/op_mul_mat_f16.comp (100%) rename {kompute => kompute-shaders}/op_mul_mat_mat_f32.comp (100%) rename {kompute => kompute-shaders}/op_mul_mat_q4_0.comp (100%) rename {kompute => kompute-shaders}/op_mul_mat_q4_1.comp (100%) rename {kompute => kompute-shaders}/op_mul_mat_q6_k.comp (100%) rename {kompute => kompute-shaders}/op_mul_mat_q8_0.comp (100%) rename {kompute => kompute-shaders}/op_mul_mv_q_n.comp (100%) rename {kompute => kompute-shaders}/op_mulrow.comp (100%) rename {kompute => kompute-shaders}/op_norm.comp (100%) rename {kompute => kompute-shaders}/op_relu.comp (100%) rename {kompute => kompute-shaders}/op_rmsnorm.comp (100%) rename {kompute => kompute-shaders}/op_rope_f16.comp (100%) rename {kompute => kompute-shaders}/op_rope_f32.comp (100%) rename {kompute => kompute-shaders}/op_scale.comp (100%) rename {kompute => kompute-shaders}/op_scale_8.comp (100%) rename {kompute => kompute-shaders}/op_silu.comp (100%) rename {kompute => kompute-shaders}/op_softmax.comp (100%) rename {kompute => 
kompute-shaders}/rope_common.comp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e9183625..8260dd6cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -403,15 +403,16 @@ if (LLAMA_KOMPUTE) set(multiValueArgs SOURCES) cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) foreach(source ${compile_shader_SOURCES}) - set(spv_file ${source}.spv) + get_filename_component(filename ${source} NAME) + set(spv_file ${filename}.spv) add_custom_command( OUTPUT ${spv_file} DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} - ${CMAKE_CURRENT_SOURCE_DIR}/kompute/common.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute/op_getrows.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute/op_mul_mv_q_n.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp + ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} - COMMENT "Compiling ${source} to ${source}.spv" + COMMENT "Compiling ${source} to ${spv_file}" ) get_filename_component(RAW_FILE_NAME ${spv_file} NAME) @@ -444,35 +445,35 @@ if (LLAMA_KOMPUTE) # Compile our shaders compile_shader(SOURCES - kompute/op_scale.comp - kompute/op_scale_8.comp - kompute/op_add.comp - kompute/op_addrow.comp - kompute/op_mul.comp - kompute/op_mulrow.comp - kompute/op_silu.comp - kompute/op_relu.comp - kompute/op_gelu.comp - kompute/op_softmax.comp - kompute/op_norm.comp - kompute/op_rmsnorm.comp - kompute/op_diagmask.comp - kompute/op_mul_mat_mat_f32.comp - kompute/op_mul_mat_f16.comp - kompute/op_mul_mat_q8_0.comp - kompute/op_mul_mat_q4_0.comp - kompute/op_mul_mat_q4_1.comp - kompute/op_mul_mat_q6_k.comp - kompute/op_getrows_f16.comp - kompute/op_getrows_q4_0.comp - kompute/op_getrows_q4_1.comp - kompute/op_getrows_q6_k.comp - kompute/op_rope_f16.comp - kompute/op_rope_f32.comp - kompute/op_cpy_f16_f16.comp - kompute/op_cpy_f16_f32.comp - kompute/op_cpy_f32_f16.comp - kompute/op_cpy_f32_f32.comp + kompute-shaders/op_scale.comp + kompute-shaders/op_scale_8.comp + kompute-shaders/op_add.comp + kompute-shaders/op_addrow.comp + kompute-shaders/op_mul.comp + kompute-shaders/op_mulrow.comp + kompute-shaders/op_silu.comp + kompute-shaders/op_relu.comp + kompute-shaders/op_gelu.comp + kompute-shaders/op_softmax.comp + kompute-shaders/op_norm.comp + kompute-shaders/op_rmsnorm.comp + kompute-shaders/op_diagmask.comp + kompute-shaders/op_mul_mat_mat_f32.comp + kompute-shaders/op_mul_mat_f16.comp + kompute-shaders/op_mul_mat_q8_0.comp + kompute-shaders/op_mul_mat_q4_0.comp + kompute-shaders/op_mul_mat_q4_1.comp + kompute-shaders/op_mul_mat_q6_k.comp + kompute-shaders/op_getrows_f16.comp + kompute-shaders/op_getrows_q4_0.comp + kompute-shaders/op_getrows_q4_1.comp + kompute-shaders/op_getrows_q6_k.comp + kompute-shaders/op_rope_f16.comp + kompute-shaders/op_rope_f32.comp + kompute-shaders/op_cpy_f16_f16.comp + kompute-shaders/op_cpy_f16_f32.comp + kompute-shaders/op_cpy_f32_f16.comp + kompute-shaders/op_cpy_f32_f32.comp ) # Create a custom target for our generated shaders diff --git a/kompute/common.comp b/kompute-shaders/common.comp similarity index 100% rename from kompute/common.comp rename to kompute-shaders/common.comp diff --git a/kompute/op_add.comp b/kompute-shaders/op_add.comp similarity index 100% rename from kompute/op_add.comp rename to kompute-shaders/op_add.comp diff --git a/kompute/op_addrow.comp b/kompute-shaders/op_addrow.comp similarity index 100% rename from 
kompute/op_addrow.comp rename to kompute-shaders/op_addrow.comp diff --git a/kompute/op_cpy_f16_f16.comp b/kompute-shaders/op_cpy_f16_f16.comp similarity index 100% rename from kompute/op_cpy_f16_f16.comp rename to kompute-shaders/op_cpy_f16_f16.comp diff --git a/kompute/op_cpy_f16_f32.comp b/kompute-shaders/op_cpy_f16_f32.comp similarity index 100% rename from kompute/op_cpy_f16_f32.comp rename to kompute-shaders/op_cpy_f16_f32.comp diff --git a/kompute/op_cpy_f32_f16.comp b/kompute-shaders/op_cpy_f32_f16.comp similarity index 100% rename from kompute/op_cpy_f32_f16.comp rename to kompute-shaders/op_cpy_f32_f16.comp diff --git a/kompute/op_cpy_f32_f32.comp b/kompute-shaders/op_cpy_f32_f32.comp similarity index 100% rename from kompute/op_cpy_f32_f32.comp rename to kompute-shaders/op_cpy_f32_f32.comp diff --git a/kompute/op_diagmask.comp b/kompute-shaders/op_diagmask.comp similarity index 100% rename from kompute/op_diagmask.comp rename to kompute-shaders/op_diagmask.comp diff --git a/kompute/op_gelu.comp b/kompute-shaders/op_gelu.comp similarity index 100% rename from kompute/op_gelu.comp rename to kompute-shaders/op_gelu.comp diff --git a/kompute/op_getrows.comp b/kompute-shaders/op_getrows.comp similarity index 100% rename from kompute/op_getrows.comp rename to kompute-shaders/op_getrows.comp diff --git a/kompute/op_getrows_f16.comp b/kompute-shaders/op_getrows_f16.comp similarity index 100% rename from kompute/op_getrows_f16.comp rename to kompute-shaders/op_getrows_f16.comp diff --git a/kompute/op_getrows_q4_0.comp b/kompute-shaders/op_getrows_q4_0.comp similarity index 100% rename from kompute/op_getrows_q4_0.comp rename to kompute-shaders/op_getrows_q4_0.comp diff --git a/kompute/op_getrows_q4_1.comp b/kompute-shaders/op_getrows_q4_1.comp similarity index 100% rename from kompute/op_getrows_q4_1.comp rename to kompute-shaders/op_getrows_q4_1.comp diff --git a/kompute/op_getrows_q6_k.comp b/kompute-shaders/op_getrows_q6_k.comp similarity index 100% rename from kompute/op_getrows_q6_k.comp rename to kompute-shaders/op_getrows_q6_k.comp diff --git a/kompute/op_mul.comp b/kompute-shaders/op_mul.comp similarity index 100% rename from kompute/op_mul.comp rename to kompute-shaders/op_mul.comp diff --git a/kompute/op_mul_mat_f16.comp b/kompute-shaders/op_mul_mat_f16.comp similarity index 100% rename from kompute/op_mul_mat_f16.comp rename to kompute-shaders/op_mul_mat_f16.comp diff --git a/kompute/op_mul_mat_mat_f32.comp b/kompute-shaders/op_mul_mat_mat_f32.comp similarity index 100% rename from kompute/op_mul_mat_mat_f32.comp rename to kompute-shaders/op_mul_mat_mat_f32.comp diff --git a/kompute/op_mul_mat_q4_0.comp b/kompute-shaders/op_mul_mat_q4_0.comp similarity index 100% rename from kompute/op_mul_mat_q4_0.comp rename to kompute-shaders/op_mul_mat_q4_0.comp diff --git a/kompute/op_mul_mat_q4_1.comp b/kompute-shaders/op_mul_mat_q4_1.comp similarity index 100% rename from kompute/op_mul_mat_q4_1.comp rename to kompute-shaders/op_mul_mat_q4_1.comp diff --git a/kompute/op_mul_mat_q6_k.comp b/kompute-shaders/op_mul_mat_q6_k.comp similarity index 100% rename from kompute/op_mul_mat_q6_k.comp rename to kompute-shaders/op_mul_mat_q6_k.comp diff --git a/kompute/op_mul_mat_q8_0.comp b/kompute-shaders/op_mul_mat_q8_0.comp similarity index 100% rename from kompute/op_mul_mat_q8_0.comp rename to kompute-shaders/op_mul_mat_q8_0.comp diff --git a/kompute/op_mul_mv_q_n.comp b/kompute-shaders/op_mul_mv_q_n.comp similarity index 100% rename from kompute/op_mul_mv_q_n.comp rename to 
kompute-shaders/op_mul_mv_q_n.comp diff --git a/kompute/op_mulrow.comp b/kompute-shaders/op_mulrow.comp similarity index 100% rename from kompute/op_mulrow.comp rename to kompute-shaders/op_mulrow.comp diff --git a/kompute/op_norm.comp b/kompute-shaders/op_norm.comp similarity index 100% rename from kompute/op_norm.comp rename to kompute-shaders/op_norm.comp diff --git a/kompute/op_relu.comp b/kompute-shaders/op_relu.comp similarity index 100% rename from kompute/op_relu.comp rename to kompute-shaders/op_relu.comp diff --git a/kompute/op_rmsnorm.comp b/kompute-shaders/op_rmsnorm.comp similarity index 100% rename from kompute/op_rmsnorm.comp rename to kompute-shaders/op_rmsnorm.comp diff --git a/kompute/op_rope_f16.comp b/kompute-shaders/op_rope_f16.comp similarity index 100% rename from kompute/op_rope_f16.comp rename to kompute-shaders/op_rope_f16.comp diff --git a/kompute/op_rope_f32.comp b/kompute-shaders/op_rope_f32.comp similarity index 100% rename from kompute/op_rope_f32.comp rename to kompute-shaders/op_rope_f32.comp diff --git a/kompute/op_scale.comp b/kompute-shaders/op_scale.comp similarity index 100% rename from kompute/op_scale.comp rename to kompute-shaders/op_scale.comp diff --git a/kompute/op_scale_8.comp b/kompute-shaders/op_scale_8.comp similarity index 100% rename from kompute/op_scale_8.comp rename to kompute-shaders/op_scale_8.comp diff --git a/kompute/op_silu.comp b/kompute-shaders/op_silu.comp similarity index 100% rename from kompute/op_silu.comp rename to kompute-shaders/op_silu.comp diff --git a/kompute/op_softmax.comp b/kompute-shaders/op_softmax.comp similarity index 100% rename from kompute/op_softmax.comp rename to kompute-shaders/op_softmax.comp diff --git a/kompute/rope_common.comp b/kompute-shaders/rope_common.comp similarity index 100% rename from kompute/rope_common.comp rename to kompute-shaders/rope_common.comp From b906e126ca1aace9aebf2b705a033a78998e4ef5 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 13 Dec 2023 17:30:38 -0500 Subject: [PATCH 72/93] kompute : fix compile warnings --- ggml-kompute.cpp | 70 +++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index df8bcca3d..f70231bed 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -64,7 +64,7 @@ struct ggml_kompute_context { // we *have* to have the kompute manager no matter what for device discovery, but the kompute context // is only created when a device is set and vulkan is explicitly turned on. 
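The hunks that follow mark each file-local helper in ggml-kompute.cpp `static`. As a minimal sketch of why that addresses the compile warnings (the commit message does not name the exact diagnostics, so `-Wmissing-declarations`-style warnings are an assumption here, and the snippet below is self-contained rather than taken from the patch):

```c++
// Illustrative sketch only; the names mirror the patch, but this file is
// self-contained and is not part of ggml-kompute.cpp.
namespace kp { struct Manager { bool hasInstance() const { return true; } }; }

// Without 'static', this definition would have external linkage, so warnings
// such as GCC's -Wmissing-declarations (or Clang's -Wmissing-prototypes)
// expect a prior declaration in a header.
//
// With 'static', the helper has internal linkage: it is private to this
// translation unit, no header declaration is expected, and the warning goes
// away -- which is all the patch changes.
static kp::Manager *komputeManager() {
    static kp::Manager *s_mgr = nullptr;
    // Recreate the manager if its instance went away (mirrors the backend's check).
    if (s_mgr && !s_mgr->hasInstance()) {
        delete s_mgr;
        s_mgr = nullptr;
    }
    if (!s_mgr) {
        s_mgr = new kp::Manager;
    }
    return s_mgr;
}

int main() {
    kp::Manager *mgr = komputeManager();  // same lazily-created instance on every call
    (void)mgr;
    return 0;
}
```

An anonymous namespace would give the same internal linkage; using `static` keeps the diff to a one-word change per function.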
ggml_kompute_context *s_kompute_context = nullptr; -kp::Manager *komputeManager() { +static kp::Manager *komputeManager() { static kp::Manager *s_mgr = nullptr; if (s_mgr && !s_mgr->hasInstance()) { delete s_mgr; @@ -551,7 +551,7 @@ void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * komputeManager()->sequence()->eval({res}); } -std::vector getSpirvShader(const unsigned char* rawData, size_t size) { +static std::vector getSpirvShader(const unsigned char* rawData, size_t size) { if (size % sizeof(uint32_t) != 0) { throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)"); } @@ -573,7 +573,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) { return a / b; } -void ggml_vk_add( +static void ggml_vk_add( kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, @@ -621,7 +621,7 @@ void ggml_vk_add( seq.record(s_algo); } -void ggml_vk_addrow(kp::Sequence& seq, +static void ggml_vk_addrow(kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, const std::shared_ptr& out, @@ -652,7 +652,7 @@ void ggml_vk_addrow(kp::Sequence& seq, seq.record(s_algo); } -void ggml_vk_mul(kp::Sequence& seq, +static void ggml_vk_mul(kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, const std::shared_ptr& out, @@ -681,7 +681,7 @@ void ggml_vk_mul(kp::Sequence& seq, seq.record(s_algo); } -void ggml_vk_mulrow(kp::Sequence& seq, +static void ggml_vk_mulrow(kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, const std::shared_ptr& out, @@ -712,7 +712,7 @@ void ggml_vk_mulrow(kp::Sequence& seq, seq.record(s_algo); } -void ggml_vk_scale(kp::Sequence& seq, +static void ggml_vk_scale(kp::Sequence& seq, const std::shared_ptr& in, const std::shared_ptr& out, uint32_t inOff, uint32_t outOff, @@ -753,7 +753,7 @@ void ggml_vk_scale(kp::Sequence& seq, seq.record(s_algo); } -void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, +static void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, const std::shared_ptr& in, const std::shared_ptr& out, uint32_t inOff, uint32_t outOff, @@ -778,7 +778,7 @@ void ggml_vk_xxlu(const std::vector& spirv, kp::Sequence& seq, } template -void ggml_vk_silu(Args&&... args) { +static void ggml_vk_silu(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv, kp::shader_data::op_silu_comp_spv_len); @@ -786,7 +786,7 @@ void ggml_vk_silu(Args&&... args) { } template -void ggml_vk_relu(Args&&... args) { +static void ggml_vk_relu(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv, kp::shader_data::op_relu_comp_spv_len); @@ -794,14 +794,14 @@ void ggml_vk_relu(Args&&... args) { } template -void ggml_vk_gelu(Args&&... args) { +static void ggml_vk_gelu(Args&&... 
args) { const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv, kp::shader_data::op_gelu_comp_spv_len); ggml_vk_xxlu(spirv, std::forward(args)...); } -void ggml_vk_soft_max(kp::Sequence& seq, +static void ggml_vk_soft_max(kp::Sequence& seq, const std::shared_ptr& in, const std::shared_ptr& out, uint32_t inOff, uint32_t outOff, @@ -833,7 +833,7 @@ void ggml_vk_soft_max(kp::Sequence& seq, seq.record(s_algo); } -void ggml_vk_norm_(const std::vector& spirv, kp::Sequence& seq, +static void ggml_vk_norm_(const std::vector& spirv, kp::Sequence& seq, const std::shared_ptr& in, const std::shared_ptr& out, uint32_t inOff, uint32_t outOff, @@ -865,7 +865,7 @@ void ggml_vk_norm_(const std::vector& spirv, kp::Sequence& seq, } template -void ggml_vk_norm(Args&&... args) { +static void ggml_vk_norm(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv, kp::shader_data::op_norm_comp_spv_len); @@ -873,14 +873,14 @@ void ggml_vk_norm(Args&&... args) { } template -void ggml_vk_rms_norm(Args&&... args) { +static void ggml_vk_rms_norm(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv, kp::shader_data::op_rmsnorm_comp_spv_len); ggml_vk_norm_(spirv, std::forward(args)...); } -void ggml_vk_diag_mask_inf(kp::Sequence& seq, +static void ggml_vk_diag_mask_inf(kp::Sequence& seq, const std::shared_ptr& in, const std::shared_ptr& out, uint32_t inOff, uint32_t outOff, @@ -912,7 +912,7 @@ void ggml_vk_diag_mask_inf(kp::Sequence& seq, seq.record(s_algo); } -void ggml_vk_mul_mat_f16(kp::Sequence& seq, +static void ggml_vk_mul_mat_f16(kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, const std::shared_ptr& out, @@ -951,7 +951,7 @@ void ggml_vk_mul_mat_f16(kp::Sequence& seq, seq.record(s_algo); } -void ggml_vk_mul_mat_q8_0(kp::Sequence& seq, +static void ggml_vk_mul_mat_q8_0(kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, const std::shared_ptr& out, @@ -989,7 +989,7 @@ void ggml_vk_mul_mat_q8_0(kp::Sequence& seq, } -void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq, +static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, const std::shared_ptr& out, @@ -1039,7 +1039,7 @@ void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq, seq.record(s_algo); } -void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_size, kp::Sequence& seq, +static void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_size, kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, const std::shared_ptr& out, @@ -1069,7 +1069,7 @@ void ggml_vk_mul_mat_q4_x(const std::vector& spirv, uint32_t block_siz } template -void ggml_vk_mul_mat_q4_0(Args&&... args) { +static void ggml_vk_mul_mat_q4_0(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv, kp::shader_data::op_mul_mat_q4_0_comp_spv_len); @@ -1077,14 +1077,14 @@ void ggml_vk_mul_mat_q4_0(Args&&... args) { } template -void ggml_vk_mul_mat_q4_1(Args&&... args) { +static void ggml_vk_mul_mat_q4_1(Args&&... 
args) { const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv, kp::shader_data::op_mul_mat_q4_1_comp_spv_len); ggml_vk_mul_mat_q4_x(spirv, 1/*We access blocks unaligned*/, std::forward(args)...); } -void ggml_vk_mul_mat_q6_k(kp::Sequence& seq, +static void ggml_vk_mul_mat_q6_k(kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, const std::shared_ptr& out, @@ -1116,7 +1116,7 @@ void ggml_vk_mul_mat_q6_k(kp::Sequence& seq, seq.record(s_algo); } -void ggml_vk_get_rows(const std::vector& spirv, +static void ggml_vk_get_rows(const std::vector& spirv, unsigned element_size, unsigned qk, kp::Sequence& seq, const std::shared_ptr& inA, @@ -1151,7 +1151,7 @@ void ggml_vk_get_rows(const std::vector& spirv, } template -void ggml_vk_get_rows_f16(Args&&... args) { +static void ggml_vk_get_rows_f16(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv, kp::shader_data::op_getrows_f16_comp_spv_len); @@ -1159,7 +1159,7 @@ void ggml_vk_get_rows_f16(Args&&... args) { } template -void ggml_vk_get_rows_q4_0(Args&&... args) { +static void ggml_vk_get_rows_q4_0(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv, kp::shader_data::op_getrows_q4_0_comp_spv_len); @@ -1167,7 +1167,7 @@ void ggml_vk_get_rows_q4_0(Args&&... args) { } template -void ggml_vk_get_rows_q4_1(Args&&... args) { +static void ggml_vk_get_rows_q4_1(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv, kp::shader_data::op_getrows_q4_1_comp_spv_len); @@ -1175,13 +1175,13 @@ void ggml_vk_get_rows_q4_1(Args&&... args) { } template -void ggml_vk_get_rows_q6_k(Args&&... args) { +static void ggml_vk_get_rows_q6_k(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv, kp::shader_data::op_getrows_q6_k_comp_spv_len); ggml_vk_get_rows(spirv, 1/*We access blocks unaligned*/, QK_NL, std::forward(args)...); } -void ggml_vk_rope( +static void ggml_vk_rope( kp::Sequence& seq, const std::shared_ptr& inA, const std::shared_ptr& inB, @@ -1249,7 +1249,7 @@ void ggml_vk_rope( } template -void ggml_vk_cpy(const std::vector& spirv, +static void ggml_vk_cpy(const std::vector& spirv, kp::Sequence& seq, const std::shared_ptr& in, const std::shared_ptr& out, @@ -1289,28 +1289,28 @@ void ggml_vk_cpy(const std::vector& spirv, } template -void ggml_vk_cpy_f32_f16(Args&&... args) { +static void ggml_vk_cpy_f32_f16(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv, kp::shader_data::op_cpy_f32_f16_comp_spv_len); ggml_vk_cpy<4, 2>(spirv, std::forward(args)...); } template -void ggml_vk_cpy_f32_f32(Args&&... args) { +static void ggml_vk_cpy_f32_f32(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv, kp::shader_data::op_cpy_f32_f32_comp_spv_len); ggml_vk_cpy<4, 4>(spirv, std::forward(args)...); } template -void ggml_vk_cpy_f16_f16(Args&&... args) { +static void ggml_vk_cpy_f16_f16(Args&&... args) { const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv, kp::shader_data::op_cpy_f16_f16_comp_spv_len); ggml_vk_cpy<2, 2>(spirv, std::forward(args)...); } template -void ggml_vk_cpy_f16_f32(Args&&... args) { +static void ggml_vk_cpy_f16_f32(Args&&... 
args) { const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv, kp::shader_data::op_cpy_f16_f32_comp_spv_len); ggml_vk_cpy<2, 4>(spirv, std::forward(args)...); @@ -1349,6 +1349,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph case GGML_OP_TRANSPOSE: case GGML_OP_PERMUTE: continue; // noop -> next node + default: + break; } const int32_t ne00 = src0 ? src0->ne[0] : 0; From 9af7f58b7beb2a66f8a199758a0cfe74989a45df Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 13 Dec 2023 17:54:35 -0500 Subject: [PATCH 73/93] move kompute to a submodule --- .gitmodules | 3 + kompute | 1 + kompute/.ccls | 27 - kompute/.clang-format | 5 - kompute/.dockerignore | 4 - kompute/.github/workflows/cpp_examples.yml | 58 -- kompute/.github/workflows/cpp_tests.yml | 104 --- kompute/.github/workflows/python_tests.yml | 28 - kompute/CMakeLists.txt | 189 ---- kompute/LICENSE | 203 ----- kompute/Makefile | 210 ----- kompute/README.md | 513 ----------- kompute/cmake/bin2h.cmake | 106 --- kompute/cmake/bin_file_to_header.cmake | 19 - kompute/cmake/check_vulkan_version.cmake | 139 --- kompute/cmake/code_coverage.cmake | 35 - kompute/cmake/deprecation_warnings.cmake | 15 - kompute/cmake/komputeConfig.cmake.in | 8 - kompute/cmake/vulkan_shader_compiler.cmake | 43 - kompute/config/FindSphinx.cmake | 16 - kompute/external/bin/xxd.c | 819 ------------------ kompute/kompute-config.cmake | 28 - kompute/scripts/convert_shaders.py | 149 ---- kompute/scripts/requirements.txt | 11 - kompute/setup.py | 93 -- kompute/src/Algorithm.cpp | 418 --------- kompute/src/CMakeLists.txt | 86 -- kompute/src/Core.cpp | 17 - kompute/src/Manager.cpp | 512 ----------- kompute/src/OpAlgoDispatch.cpp | 57 -- kompute/src/OpBufferSyncDevice.cpp | 43 - kompute/src/OpBufferSyncLocal.cpp | 43 - kompute/src/OpMemoryBarrier.cpp | 66 -- kompute/src/OpTensorCopy.cpp | 82 -- kompute/src/OpTensorFill.cpp | 47 - kompute/src/OpTensorSyncDevice.cpp | 53 -- kompute/src/OpTensorSyncLocal.cpp | 68 -- kompute/src/Sequence.cpp | 388 --------- kompute/src/Tensor.cpp | 450 ---------- kompute/src/include/CMakeLists.txt | 47 - kompute/src/include/kompute/Algorithm.hpp | 330 ------- kompute/src/include/kompute/Core.hpp | 30 - kompute/src/include/kompute/Kompute.hpp | 22 - kompute/src/include/kompute/Manager.hpp | 284 ------ kompute/src/include/kompute/Sequence.hpp | 304 ------- kompute/src/include/kompute/Tensor.hpp | 302 ------- kompute/src/include/kompute/logger/Logger.hpp | 197 ----- .../kompute/operations/OpAlgoDispatch.hpp | 86 -- .../src/include/kompute/operations/OpBase.hpp | 62 -- .../kompute/operations/OpBufferSyncDevice.hpp | 50 -- .../kompute/operations/OpBufferSyncLocal.hpp | 50 -- .../kompute/operations/OpMemoryBarrier.hpp | 81 -- .../src/include/kompute/operations/OpMult.hpp | 58 -- .../kompute/operations/OpTensorCopy.hpp | 63 -- .../kompute/operations/OpTensorFill.hpp | 58 -- .../kompute/operations/OpTensorSyncDevice.hpp | 66 -- .../kompute/operations/OpTensorSyncLocal.hpp | 66 -- kompute/src/logger/CMakeLists.txt | 69 -- kompute/src/logger/Logger.cpp | 101 --- kompute/src/shaders/CMakeLists.txt | 5 - kompute/src/shaders/glsl/CMakeLists.txt | 26 - .../glsl/ShaderLogisticRegression.comp | 52 -- .../glsl/ShaderLogisticRegression.hpp.in | 310 ------- kompute/src/shaders/glsl/ShaderOpMult.comp | 28 - kompute/src/shaders/glsl/ShaderOpMult.hpp.in | 101 --- kompute/src/shaders/hlsl/computeheadless.comp | 29 - 66 files changed, 4 insertions(+), 8029 deletions(-) create mode 160000 kompute delete 
mode 100644 kompute/.ccls delete mode 100644 kompute/.clang-format delete mode 100644 kompute/.dockerignore delete mode 100644 kompute/.github/workflows/cpp_examples.yml delete mode 100644 kompute/.github/workflows/cpp_tests.yml delete mode 100644 kompute/.github/workflows/python_tests.yml delete mode 100644 kompute/CMakeLists.txt delete mode 100644 kompute/LICENSE delete mode 100644 kompute/Makefile delete mode 100644 kompute/README.md delete mode 100644 kompute/cmake/bin2h.cmake delete mode 100644 kompute/cmake/bin_file_to_header.cmake delete mode 100644 kompute/cmake/check_vulkan_version.cmake delete mode 100644 kompute/cmake/code_coverage.cmake delete mode 100644 kompute/cmake/deprecation_warnings.cmake delete mode 100644 kompute/cmake/komputeConfig.cmake.in delete mode 100644 kompute/cmake/vulkan_shader_compiler.cmake delete mode 100644 kompute/config/FindSphinx.cmake delete mode 100644 kompute/external/bin/xxd.c delete mode 100644 kompute/kompute-config.cmake delete mode 100755 kompute/scripts/convert_shaders.py delete mode 100644 kompute/scripts/requirements.txt delete mode 100644 kompute/setup.py delete mode 100644 kompute/src/Algorithm.cpp delete mode 100644 kompute/src/CMakeLists.txt delete mode 100644 kompute/src/Core.cpp delete mode 100644 kompute/src/Manager.cpp delete mode 100644 kompute/src/OpAlgoDispatch.cpp delete mode 100644 kompute/src/OpBufferSyncDevice.cpp delete mode 100644 kompute/src/OpBufferSyncLocal.cpp delete mode 100644 kompute/src/OpMemoryBarrier.cpp delete mode 100644 kompute/src/OpTensorCopy.cpp delete mode 100644 kompute/src/OpTensorFill.cpp delete mode 100644 kompute/src/OpTensorSyncDevice.cpp delete mode 100644 kompute/src/OpTensorSyncLocal.cpp delete mode 100644 kompute/src/Sequence.cpp delete mode 100644 kompute/src/Tensor.cpp delete mode 100644 kompute/src/include/CMakeLists.txt delete mode 100644 kompute/src/include/kompute/Algorithm.hpp delete mode 100644 kompute/src/include/kompute/Core.hpp delete mode 100644 kompute/src/include/kompute/Kompute.hpp delete mode 100644 kompute/src/include/kompute/Manager.hpp delete mode 100644 kompute/src/include/kompute/Sequence.hpp delete mode 100644 kompute/src/include/kompute/Tensor.hpp delete mode 100644 kompute/src/include/kompute/logger/Logger.hpp delete mode 100644 kompute/src/include/kompute/operations/OpAlgoDispatch.hpp delete mode 100644 kompute/src/include/kompute/operations/OpBase.hpp delete mode 100644 kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp delete mode 100644 kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp delete mode 100644 kompute/src/include/kompute/operations/OpMemoryBarrier.hpp delete mode 100644 kompute/src/include/kompute/operations/OpMult.hpp delete mode 100644 kompute/src/include/kompute/operations/OpTensorCopy.hpp delete mode 100644 kompute/src/include/kompute/operations/OpTensorFill.hpp delete mode 100644 kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp delete mode 100644 kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp delete mode 100644 kompute/src/logger/CMakeLists.txt delete mode 100644 kompute/src/logger/Logger.cpp delete mode 100644 kompute/src/shaders/CMakeLists.txt delete mode 100644 kompute/src/shaders/glsl/CMakeLists.txt delete mode 100644 kompute/src/shaders/glsl/ShaderLogisticRegression.comp delete mode 100644 kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in delete mode 100644 kompute/src/shaders/glsl/ShaderOpMult.comp delete mode 100644 kompute/src/shaders/glsl/ShaderOpMult.hpp.in delete mode 100644 
kompute/src/shaders/hlsl/computeheadless.comp diff --git a/.gitmodules b/.gitmodules index e69de29bb..b7e8b8ff2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "kompute"] + path = kompute + url = https://github.com/nomic-ai/kompute.git diff --git a/kompute b/kompute new file mode 160000 index 000000000..2d0a8abc6 --- /dev/null +++ b/kompute @@ -0,0 +1 @@ +Subproject commit 2d0a8abc64e90a0956390aa3f1854cb6d48141db diff --git a/kompute/.ccls b/kompute/.ccls deleted file mode 100644 index 71d5d711e..000000000 --- a/kompute/.ccls +++ /dev/null @@ -1,27 +0,0 @@ - -%clang - --fdeclspec --fms-extensions --Wall --Wextra --std=c++17 - -%h -x -%h c++-header - --DDEBUG=1 --DKOMPUTE_INCLUDE_FOR_SYNTAX - --I/usr/include/python3.6/ --I./python/pybind11/include/ - --I./build/_deps/vulkan_header-src/include/ --I./build/_deps/spdlog-src/include/ --I./build/_deps/googletest-src/googletest/include/ --I./build/_deps/fmt-src/include/ - --I./src/include/ --I./build/src/shaders/glsl/ --I./build/test/shaders/glsl/ --I./test/utils/ diff --git a/kompute/.clang-format b/kompute/.clang-format deleted file mode 100644 index 5191313a3..000000000 --- a/kompute/.clang-format +++ /dev/null @@ -1,5 +0,0 @@ ---- -BasedOnStyle: Mozilla -IndentWidth: 4 - -... diff --git a/kompute/.dockerignore b/kompute/.dockerignore deleted file mode 100644 index 9498d9195..000000000 --- a/kompute/.dockerignore +++ /dev/null @@ -1,4 +0,0 @@ -build/* -examples/* -docker-builders/ -swiftshader/ diff --git a/kompute/.github/workflows/cpp_examples.yml b/kompute/.github/workflows/cpp_examples.yml deleted file mode 100644 index ad5306e9b..000000000 --- a/kompute/.github/workflows/cpp_examples.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: C++ Tests - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - array-multiplication-example: - runs-on: ubuntu-latest - container: axsauze/kompute-builder:0.4 - env: - VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - - name: "[Release g++] Build & Test" - uses: KomputeProject/action-cmake-build@master - with: - build-dir: ${{github.workspace}}/examples/array_multiplication/build - source-dir: ${{github.workspace}}/examples/array_multiplication - cc: gcc - cxx: g++ - build-type: Debug - run-test: false - ctest-options: -V - configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON - build-options: --parallel # Given we don't build too many resources we can leverage parallel - - name: Run tests - run: ./examples/array_multiplication/build/src/kompute_array_mult - - logistc-regression-example: - runs-on: ubuntu-latest - container: axsauze/kompute-builder:0.4 - env: - VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - - name: "[Release g++] Build & Test" - uses: KomputeProject/action-cmake-build@master - with: - build-dir: ${{github.workspace}}/examples/logistic_regression/build - source-dir: ${{github.workspace}}/examples/logistic_regression - cc: gcc - cxx: g++ - build-type: Debug - run-test: false - ctest-options: -V - configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON - build-options: --parallel # Given we don't build too many resources we can leverage parallel - - name: Run tests - run: ./examples/logistic_regression/build/src/kompute_logistic_regression diff --git a/kompute/.github/workflows/cpp_tests.yml 
b/kompute/.github/workflows/cpp_tests.yml deleted file mode 100644 index 53a90a145..000000000 --- a/kompute/.github/workflows/cpp_tests.yml +++ /dev/null @@ -1,104 +0,0 @@ -name: C++ Tests - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - cpp-tests-debug-with-debug-layers: - runs-on: ubuntu-latest - container: axsauze/kompute-builder:0.4 - env: - VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - - name: "[Release g++] Build & Test" - uses: KomputeProject/action-cmake-build@master - with: - build-dir: ${{github.workspace}}/build - source-dir: ${{github.workspace}} - cc: gcc - cxx: g++ - build-type: Debug - run-test: false - ctest-options: -V - configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON - - name: Run tests - run: make mk_run_tests - - cpp-tests-release-with-debug-layers: - runs-on: ubuntu-latest - container: axsauze/kompute-builder:0.4 - env: - VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - - name: "[Release g++] Build & Test" - uses: KomputeProject/action-cmake-build@master - with: - build-dir: ${{github.workspace}}/build - source-dir: ${{github.workspace}} - cc: gcc - cxx: g++ - build-type: Release - run-test: false - ctest-options: -V - configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON - - name: Run tests - run: make mk_run_tests - - cpp-tests-debug-without-debug-layers: - runs-on: ubuntu-latest - container: axsauze/kompute-builder:0.4 - env: - VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - - name: "[Release g++] Build & Test" - uses: KomputeProject/action-cmake-build@master - with: - build-dir: ${{github.workspace}}/build - source-dir: ${{github.workspace}} - cc: gcc - cxx: g++ - build-type: Debug - run-test: false - ctest-options: -V - configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON - - name: Run tests - run: make mk_run_tests - - cpp-tests-release-without-debug-layers: - runs-on: ubuntu-latest - container: axsauze/kompute-builder:0.4 - env: - VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json" - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - - name: "[Release g++] Build & Test" - uses: KomputeProject/action-cmake-build@master - with: - build-dir: ${{github.workspace}}/build - source-dir: ${{github.workspace}} - cc: gcc - cxx: g++ - build-type: Release - run-test: false - ctest-options: -V - configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON - - name: Run tests - run: make mk_run_tests diff --git a/kompute/.github/workflows/python_tests.yml b/kompute/.github/workflows/python_tests.yml deleted file mode 100644 index 9f84d1e85..000000000 --- a/kompute/.github/workflows/python_tests.yml +++ /dev/null @@ -1,28 +0,0 @@ -name: Python Tests - -on: - push: - branches: [ master ] - pull_request: - branches: [ master ] - -jobs: - python-tests: - runs-on: ubuntu-latest - container: axsauze/kompute-builder:0.4 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: false - - name: 
Install Python Requirements - run: pip3 install --user -r python/test/requirements-dev.txt - - name: Python Build - env: - KOMPUTE_PYTHON_NUM_PARALLEL_THREADS: 2 - KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER: ON - run: pip3 install --user . -v - - name: Python run Tests - run: | - export VK_ICD_FILENAMES=/swiftshader/vk_swiftshader_icd.json - make test_python diff --git a/kompute/CMakeLists.txt b/kompute/CMakeLists.txt deleted file mode 100644 index 1bd84d7ed..000000000 --- a/kompute/CMakeLists.txt +++ /dev/null @@ -1,189 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.20) -project(kompute VERSION 0.8.1 LANGUAGES CXX) - -set(CMAKE_CXX_STANDARD 14) - -# Only change the folder behavior if kompute is not a subproject -if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME}) - set_property(GLOBAL PROPERTY USE_FOLDERS ON) - set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMake") - set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin) - set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib) -endif() - -# Avoid the dll boilerplate code for windows -set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) -set(CMAKE_CXX_STANDARD 14) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}") - -set(KOMPUTE_LIBRARIES kompute CACHE INTERNAL "") - -# #################################################### -# Options -# #################################################### -macro(kompute_option OPTION_NAME OPTION_TEXT OPTION_DEFAULT) - option(${OPTION_NAME} ${OPTION_TEXT} ${OPTION_DEFAULT}) - - if(DEFINED ENV{${OPTION_NAME}}) - # Allow overriding the option through an environment variable - set(${OPTION_NAME} $ENV{${OPTION_NAME}}) - endif() - - if(${OPTION_NAME}) - add_definitions(-D${OPTION_NAME}) - endif() - - message(STATUS " ${OPTION_NAME}: ${${OPTION_NAME}}") -endmacro() - -macro(kompute_log_level OPTION_NAME OPTION_TEXT OPTION_DEFAULT) - set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT}) - set_property(CACHE ${OPTION_NAME} PROPERTY STRINGS "Trace" "Debug" "Info" "Warn" "Error" "Critical" "Default" "Off") - - if(DEFINED ENV{${OPTION_NAME}}) - # Allow setting the option through an environment variable - set(${OPTION_NAME} $ENV{${OPTION_NAME}}) - endif() - - if(${OPTION_NAME}) - add_definitions(-D${OPTION_NAME}) - endif() - - # Allow disabling logging completely and prevent linking against it: - if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Off") - set(${OPTION_NAME}_DISABLED ON) - add_compile_definitions(${OPTION_NAME}_DISABLED=1) - endif() - - message(STATUS " ${OPTION_NAME}: ${${OPTION_NAME}}") -endmacro() - -macro(kompute_option_string OPTION_NAME OPTION_TEXT OPTION_DEFAULT) - set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT}) - - if(DEFINED ENV{${OPTION_NAME}}) - # Allow setting the option through an environment variable - set(${OPTION_NAME} $ENV{${OPTION_NAME}}) - endif() - - if(${OPTION_NAME}) - add_definitions(-D${OPTION_NAME}) - endif() - - message(STATUS " ${OPTION_NAME}: ${${OPTION_NAME}}") -endmacro() - -message(STATUS "General purpose GPU compute framework built on Vulkan") -message(STATUS "=======================================================") - -# Build options -kompute_log_level(KOMPUTE_OPT_LOG_LEVEL "Internally we use Spdlog or fmt for logging, depending on the value of 'KOMPUTE_OPT_USE_SPDLOG'. The log level used can be changed here. Possible values: 'Trace', 'Debug', 'Info', 'Warn', 'Error', 'Critical', 'Off', 'Default'. If set to 'Off' logging will be deactivated completely. 
If set to 'Default', the log level will be set to 'Info' for release builds and 'Debug' else." "Off") -kompute_option(KOMPUTE_OPT_USE_SPDLOG "If enabled, logging via KP_LOG_ will happen through Spdlog instead of plan fmt." OFF) -kompute_option(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS "Explicitly disable debug layers even on debug." ON) -kompute_option(KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK "Whether to check if your driver supports the Vulkan Header version you are linking against. This might be useful in case you build shared on a different system than you run later." OFF) -kompute_option(KOMPUTE_OPT_BUILD_SHADERS "Rebuilds all compute shaders during compilation and does not use the already precompiled versions. Requires glslangValidator to be installed on your system." OFF) - -# External components -kompute_option(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG "Use the built-in version of Spdlog. Requires 'KOMPUTE_OPT_USE_SPDLOG' to be set to ON in order to have any effect." ON) -kompute_option(KOMPUTE_OPT_SPDLOG_ASYNC_MODE "If spdlog is enabled this allows for selecting whether the default logger setup creates sync or async logger" OFF) -kompute_option(KOMPUTE_OPT_USE_BUILT_IN_FMT "Use the built-in version of fmt." ON) -kompute_option(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER "Use the built-in version of Vulkan Headers. This could be helpful in case your system Vulkan Headers are too new for your driver. If you set this to OFF, please make sure your system Vulkan Headers are supported by your driver." ON) -kompute_option_string(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "The git tag used for the built-in Vulkan Headers when 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER' is enabled. A list of tags can be found here: https://github.com/KhronosGroup/Vulkan-Headers/tags" "v1.3.231") -message(STATUS "=======================================================") - -# #################################################### -# Deprecated Options -# #################################################### -include(cmake/deprecation_warnings.cmake) - -# #################################################### -# Dependencies -# #################################################### -include(cmake/vulkan_shader_compiler.cmake) -include(cmake/check_vulkan_version.cmake) -include(FetchContent) - -# Vulkan Header -if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER) - FetchContent_Declare(vulkan_header GIT_REPOSITORY https://github.com/KhronosGroup/Vulkan-Headers.git - GIT_TAG ${KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG}) # Source: https://github.com/KhronosGroup/Vulkan-Headers/tags - FetchContent_MakeAvailable(vulkan_header) - - if(NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK) - # Ensure the driver supports this Vulkan version - check_vulkan_version(INCLUDE_DIR "${vulkan_header_SOURCE_DIR}/include") - endif() -endif() - -find_package(Vulkan REQUIRED) - -if(Vulkan_FOUND AND NOT TARGET Vulkan::Headers) - add_library(Vulkan::Headers INTERFACE IMPORTED) - set_target_properties(Vulkan::Headers PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${Vulkan_INCLUDE_DIRS}") -endif() - -if(NOT KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER AND NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK) - # Ensure the driver supports this Vulkan version - check_vulkan_version(INCLUDE_DIR ${Vulkan_INCLUDE_DIR}) -endif() - -# Spdlog -if(KOMPUTE_OPT_USE_SPDLOG) - add_compile_definitions(KOMPUTE_OPT_USE_SPDLOG=1) - - if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED) - if(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG) - set(SPDLOG_BUILD_SHARED ${BUILD_SHARED_LIBS}) - - FetchContent_Declare(spdlog GIT_REPOSITORY 
https://github.com/gabime/spdlog.git - GIT_TAG v1.10.0) # Source: https://github.com/gabime/spdlog/releases - FetchContent_MakeAvailable(spdlog) - else() - find_package(spdlog REQUIRED) - endif() - endif() -endif() - -# fmt -if(KOMPUTE_OPT_USE_BUILT_IN_FMT) - FetchContent_Declare(fmt GIT_REPOSITORY https://github.com/fmtlib/fmt.git - GIT_TAG 10.0.0) # Source: https://github.com/fmtlib/fmt/releases - FetchContent_MakeAvailable(fmt) -else() - find_package(fmt REQUIRED) -endif() - -add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) - -# #################################################### -# Preprocessor Macros -# #################################################### -if(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS) - add_compile_definitions(KOMPUTE_DISABLE_VK_DEBUG_LAYERS=1) -endif() - -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror -Wno-error=array-bounds") -endif() - -# If glslang is cloned, then SPIRV/GlslangToSpv.h will be used instead of glslang/SPIRV/GlslangToSpv.h -# As after installation, SPIRV/ header files will be found in glslang/SPIRV/ , more info in #193 -if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD) - add_definitions(-DUSE_EXTERNAL_GLSLANG) -endif() - -# Allow scripts to call main kompute Makefile -function(kompute_make KOMPUTE_MAKE_TARGET) - add_custom_target(${KOMPUTE_MAKE_TARGET} - COMMAND make -C ${PROJECT_SOURCE_DIR} ${KOMPUTE_MAKE_TARGET}) -endfunction() - -add_executable(xxd external/bin/xxd.c) - -add_subdirectory(src) diff --git a/kompute/LICENSE b/kompute/LICENSE deleted file mode 100644 index 821a2723e..000000000 --- a/kompute/LICENSE +++ /dev/null @@ -1,203 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). 
- - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2021 The Institute for Ethical AI & Machine Learning - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - diff --git a/kompute/Makefile b/kompute/Makefile deleted file mode 100644 index 62ad68b46..000000000 --- a/kompute/Makefile +++ /dev/null @@ -1,210 +0,0 @@ -# This makefile is optimized to be run from WSL and to interact with the -# Windows host as there are limitations when building GPU programs. This -# makefile contains the commands for interacting with the visual studio -# build via command line for faster iterations, as the intention is to -# support other editors (optimised for vim). There are also commands that -# support the builds for linux-native compilations and these are the commands -# starting with mk_. 
- -VERSION := $(shell cat ./VERSION) - -VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsystems\\vcpkg.cmake" -VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake" - -# These are the tests that don't work with swiftshader but can be run directly with vulkan -FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps:TestPushConstants.TestConstantsDouble" - -ifeq ($(OS),Windows_NT) # is Windows_NT on XP, 2000, 7, Vista, 10... - CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe" - SCMP_BIN="C:\\VulkanSDK\\1.2.141.2\\Bin32\\glslangValidator.exe" - MSBUILD_BIN ?= "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\MSBuild\\Current\\Bin\\MSBuild.exe" -else - CLANG_FORMAT_BIN ?= "/home/alejandro/Programming/lib/clang+llvm-10.0.0-x86_64-linux-gnu-ubuntu-18.04/bin/clang-format" - CMAKE_BIN ?= "/c/Program Files/CMake/bin/cmake.exe" - MSBUILD_BIN ?= "/c/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe" - # Choosing the binary based on whether it's on WSL or linux-native - KERNEL := $(shell uname -r) - IS_WSL := $(shell (if [[ "$(KERNEL)" =~ Microsoft$ ]]; then echo '0'; fi)) - ifeq ($(IS_WSL),0) - SCMP_BIN ?= "/c/VulkanSDK/1.2.141.2/Bin32/glslangValidator.exe" - else - SCMP_BIN ?= "/usr/bin/glslangValidator" - endif -endif - - -####### Main Target Rules ####### - -push_docs_to_ghpages: - GIT_DEPLOY_DIR="build/docs/sphinx/" \ - GIT_DEPLOY_BRANCH="gh-pages" \ - GIT_DEPLOY_REPO="origin" \ - ./scripts/push_folder_to_branch.sh - -####### CMAKE quickstart commands ####### - -clean_cmake: - rm -rf build/ - -####### Visual studio build shortcut commands ####### - -MK_BUILD_TYPE ?= "Release" -MK_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default -MK_CMAKE_EXTRA_FLAGS ?= "" -MK_KOMPUTE_EXTRA_CXX_FLAGS ?= "" - -mk_cmake: - cmake \ - -Bbuild \ - -DCMAKE_CXX_FLAGS=$(MK_KOMPUTE_EXTRA_CXX_FLAGS) \ - -DCMAKE_BUILD_TYPE=$(MK_BUILD_TYPE) \ - -DCMAKE_INSTALL_PREFIX=$(MK_INSTALL_PATH) \ - -DKOMPUTE_OPT_INSTALL=ON \ - -DKOMPUTE_OPT_BUILD_TESTS=ON \ - -DKOMPUTE_OPT_BUILD_DOCS=ON \ - -DKOMPUTE_OPT_BUILD_SHADERS=ON \ - -DKOMPUTE_OPT_CODE_COVERAGE=ON \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ - -DKOMPUTE_OPT_LOG_LEVEL=Debug \ - $(MK_CMAKE_EXTRA_FLAGS) \ - -G "Unix Makefiles" - -mk_build_all: - cmake --build build/. --parallel - -mk_build_docs: - cmake --build build/. --target gendocsall --parallel - -mk_build_kompute: - cmake --build build/. --target kompute --parallel - -mk_build_tests: - cmake --build build/. --target kompute_tests --parallel - -mk_run_docs: mk_build_docs - (cd build/docs/sphinx && python2.7 -m SimpleHTTPServer) - -# An alternative would be: ctest -vv --test-dir build/. -# But this is not possible since we need to filter specific tests, not complete executables, which is not possible with ctest. -# https://gitlab.kitware.com/cmake/cmake/-/issues/13168 -mk_run_tests: mk_build_tests - ./build/bin/kompute_tests --gtest_filter=$(FILTER_TESTS) - -mk_build_swiftshader_library: - git clone https://github.com/google/swiftshader || echo "Assuming already cloned" - # GCC 8 or above is required otherwise error on "filesystem" lib will appear - CC="/usr/bin/gcc-8" CXX="/usr/bin/g++-8" cmake swiftshader/. -Bswiftshader/build/ - cmake --build swiftshader/build/. 
--parallel - -mk_run_tests_cpu: export VK_ICD_FILENAMES=$(PWD)/swiftshader/build/vk_swiftshader_icd.json -mk_run_tests_cpu: mk_build_swiftshader_library mk_build_tests mk_run_tests_cpu_only - - -####### Visual studio build shortcut commands ####### - -VS_BUILD_TYPE ?= "Debug" -# Run with multiprocessin / parallel build by default -VS_CMAKE_EXTRA_FLAGS ?= "" -VS_KOMPUTE_EXTRA_CXX_FLAGS ?= "" -VS_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default - -vs_cmake: - $(CMAKE_BIN) \ - -Bbuild \ - $(VS_CMAKE_EXTRA_FLAGS) \ - -DCMAKE_TOOLCHAIN_FILE=$(VCPKG_WIN_PATH) \ - -DCMAKE_CXX_FLAGS=$(VS_KOMPUTE_EXTRA_CXX_FLAGS) \ - -DCMAKE_INSTALL_PREFIX=$(VS_INSTALL_PATH) \ - -DKOMPUTE_OPT_INSTALL=ON \ - -DKOMPUTE_OPT_BUILD_TESTS=ON \ - -DKOMPUTE_OPT_BUILD_SHADERS=ON \ - -DKOMPUTE_OPT_CODE_COVERAGE=OFF \ - -DKOMPUTE_OPT_BUILD_DOCS=OFF \ - -G "Visual Studio 16 2019" \ - -DCMAKE_BUILD_TYPE=$(VS_BUILD_TYPE) - -vs_build_all: - cmake --build build/. --parallel - -vs_build_docs: - cmake --build build/. --target gendocsall --parallel - -vs_install_kompute: - cmake --build build/. --target install --parallel - -vs_build_kompute: - cmake --build build/. --target kompute --parallel - -vs_build_tests: - cmake --build build/. --target kompute_tests --parallel - -vs_run_docs: vs_build_docs - (cd build/docs/sphinx && python2.7 -m SimpleHTTPServer) - -vs_run_tests: vs_build_tests - ./build/test/$(VS_BUILD_TYPE)/bin/kompute_tests.exe --gtest_filter=$(FILTER_TESTS) - - -#### PYTHONG #### - -test_python: - python3 -m pytest -s --log-cli-level=DEBUG -v python/test/ - -####### Run CI Commands ####### - -# This command uses act to replicate github action -# https://github.com/nektos/act -run_ci: - act - -####### General project commands ####### - -generate_python_docstrings: - python -m pybind11_mkdoc \ - -o python/src/docstrings.hpp \ - kompute/Kompute.hpp \ - -Iexternal/fmt/include/ \ - -Iexternal/spdlog/include/ \ - -Iexternal/glslang/ \ - -I/usr/include/c++/7.5.0/ - -install_python_reqs: - python3 -m pip install -r scripts/requirements.txt - -install_lcov: - sudo apt install lcov -y - -build_shaders: - python3 scripts/convert_shaders.py \ - --shader-path shaders/glsl \ - --shader-binary $(SCMP_BIN) \ - --header-path src/include/kompute/shaders/ \ - -v - python3 scripts/convert_shaders.py \ - --shader-path test/shaders/glsl \ - --shader-binary $(SCMP_BIN) \ - --header-path test/compiled_shaders_include/kompute_test/shaders/ \ - -v - -build_single_header: - quom \ - --include_directory \ - "src/include/" \ - "single_include/AggregateHeaders.cpp" \ - "single_include/kompute/Kompute.hpp" - -win_build_xxd: - cd external/bin/ && gcc.exe -o xxd.exe xxd.c -DCYGWIN - -format: - for val in "examples single_include src test" ; do \ - find $$val -depth -iname *.h -or -iname *.c -or -iname *.hpp -or -iname *.cpp | grep -v "shaders" | xargs $(CLANG_FORMAT_BIN) -style=file -i; \ - done - -static_scan: - cppcheck --project=build/compile_commands.json -iexternal/ - -build_changelog: - docker run --rm -it -v "$(PWD)":/usr/local/src/your-app -e CHANGELOG_GITHUB_TOKEN=${CHANGELOG_GITHUB_TOKEN} ferrarimarco/github-changelog-generator:1.15.2 -u KomputeProject -p kompute - chmod 664 CHANGELOG.md # (Read+Write, Read+Write, Read) - sed -i -e 's/\(HEAD\|Unreleased\)/v${VERSION}/g' CHANGELOG.md # Replacing unreleased version with latest tag diff --git a/kompute/README.md b/kompute/README.md deleted file mode 100644 index b169da254..000000000 --- a/kompute/README.md +++ /dev/null @@ -1,513 +0,0 @@ - 
-![GitHub](https://img.shields.io/badge/Version-0.7.0-green.svg)
-![GitHub](https://img.shields.io/badge/C++-14—20-purple.svg)
-![GitHub](https://img.shields.io/badge/Build-cmake-red.svg)
-![GitHub](https://img.shields.io/badge/Python-3.7—3.9-blue.svg)
-![GitHub](https://img.shields.io/badge/License-Apache-black.svg)
-[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/4834/badge)](https://bestpractices.coreinfrastructure.org/projects/4834)
-
-# Kompute
-
-The general purpose GPU compute framework for cross vendor graphics cards (AMD, Qualcomm, NVIDIA & friends)
-
-Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU acceleration usecases.
-
-💬 [Join the Discord & Community Calls](https://kompute.cc/overview/community.html) 🔋 [Documentation](https://kompute.cc) 💻 [Blog Post](https://medium.com/@AxSaucedo/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) ⌨ [Examples](#more-examples) 💾
-
-##### Kompute is backed by the Linux Foundation as a hosted project by the LF AI & Data Foundation.
- - -## Principles & Features - -* [Flexible Python module](#your-first-kompute-python) with [C++ SDK](#your-first-kompute-c) for optimizations -* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues -* [Mobile enabled](#mobile-enabled) with examples via Android NDK across several architectures -* BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications -* Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html) -* Robust codebase with [90% unit test code coverage](https://kompute.cc/codecov/) -* Advanced use-cases on [machine learning 🤖](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a), [mobile development 📱](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) and [game development 🎮](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0). -* Active community with [monthly calls, discord chat and more](https://kompute.cc/overview/community.html) - -![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/komputer-logos.gif) - -## Getting Started - -Below you can find a GPU multiplication example using the C++ and Python Kompute interfaces. - -You can [join the Discord](https://discord.gg/MaH5Jv5zwv) for questions / discussion, open a [github issue](https://github.com/KomputeProject/kompute/issues/new), or read [the documentation](https://kompute.cc/). - -### Your First Kompute (C++) - -The C++ interface provides low level access to the native components of Kompute, enabling for [advanced optimizations](https://kompute.cc/overview/async-parallel.html) as well as [extension of components](https://kompute.cc/overview/reference.html). - -```c++ - -void kompute(const std::string& shader) { - - // 1. Create Kompute Manager with default settings (device 0, first queue and no extensions) - kp::Manager mgr; - - // 2. Create and initialise Kompute Tensors through manager - - // Default tensor constructor simplifies creation of float values - auto tensorInA = mgr.tensor({ 2., 2., 2. }); - auto tensorInB = mgr.tensor({ 1., 2., 3. }); - // Explicit type constructor supports uint32, int32, double, float and bool - auto tensorOutA = mgr.tensorT({ 0, 0, 0 }); - auto tensorOutB = mgr.tensorT({ 0, 0, 0 }); - - std::vector> params = {tensorInA, tensorInB, tensorOutA, tensorOutB}; - - // 3. Create algorithm based on shader (supports buffers & push/spec constants) - kp::Workgroup workgroup({3, 1, 1}); - std::vector specConsts({ 2 }); - std::vector pushConstsA({ 2.0 }); - std::vector pushConstsB({ 3.0 }); - - auto algorithm = mgr.algorithm(params, - // See documentation shader section for compileSource - compileSource(shader), - workgroup, - specConsts, - pushConstsA); - - // 4. Run operation synchronously using sequence - mgr.sequence() - ->record(params) - ->record(algorithm) // Binds default push consts - ->eval() // Evaluates the two recorded operations - ->record(algorithm, pushConstsB) // Overrides push consts - ->eval(); // Evaluates only last recorded operation - - // 5. Sync results from the GPU asynchronously - auto sq = mgr.sequence(); - sq->evalAsync(params); - - // ... 
Do other work asynchronously whilst GPU finishes - - sq->evalAwait(); - - // Prints the first output which is: { 4, 8, 12 } - for (const float& elem : tensorOutA->vector()) std::cout << elem << " "; - // Prints the second output which is: { 10, 10, 10 } - for (const float& elem : tensorOutB->vector()) std::cout << elem << " "; - -} // Manages / releases all CPU and GPU memory resources - -int main() { - - // Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header - // files). This shader shows some of the main components including constants, buffers, etc - std::string shader = (R"( - #version 450 - - layout (local_size_x = 1) in; - - // The input tensors bind index is relative to index in parameter passed - layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; }; - layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; }; - layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; }; - layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; }; - - // Kompute supports push constants updated on dispatch - layout(push_constant) uniform PushConstants { - float val; - } push_const; - - // Kompute also supports spec constants on initalization - layout(constant_id = 0) const float const_one = 0; - - void main() { - uint index = gl_GlobalInvocationID.x; - out_a[index] += uint( in_a[index] * in_b[index] ); - out_b[index] += uint( const_one * push_const.val ); - } - )"); - - // Run the function declared above with our raw string shader - kompute(shader); -} - -``` - -### Your First Kompute (Python) - -The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables for experimentation whilst ensuring high performance and fast development workflows. - -```python - -from .utils import compile_source # using util function from python/test/utils - -def kompute(shader): - # 1. Create Kompute Manager with default settings (device 0, first queue and no extensions) - mgr = kp.Manager() - - # 2. Create and initialise Kompute Tensors through manager - - # Default tensor constructor simplifies creation of float values - tensor_in_a = mgr.tensor([2, 2, 2]) - tensor_in_b = mgr.tensor([1, 2, 3]) - # Explicit type constructor supports uint32, int32, double, float and bool - tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32)) - tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32)) - - params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b] - - # 3. Create algorithm based on shader (supports buffers & push/spec constants) - workgroup = (3, 1, 1) - spec_consts = [2] - push_consts_a = [2] - push_consts_b = [3] - - # See documentation shader section for compile_source - spirv = compile_source(shader) - - algo = mgr.algorithm(params, spirv, workgroup, spec_consts, push_consts_a) - - # 4. Run operation synchronously using sequence - (mgr.sequence() - .record(kp.OpTensorSyncDevice(params)) - .record(kp.OpAlgoDispatch(algo)) # Binds default push consts provided - .eval() # evaluates the two recorded ops - .record(kp.OpAlgoDispatch(algo, push_consts_b)) # Overrides push consts - .eval()) # evaluates only the last recorded op - - # 5. Sync results from the GPU asynchronously - sq = mgr.sequence() - sq.eval_async(kp.OpTensorSyncLocal(params)) - - # ... 
Do other work asynchronously whilst GPU finishes - - sq.eval_await() - - # Prints the first output which is: { 4, 8, 12 } - print(tensor_out_a) - # Prints the first output which is: { 10, 10, 10 } - print(tensor_out_b) - -if __name__ == "__main__": - - # Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header - # files). This shader shows some of the main components including constants, buffers, etc - shader = """ - #version 450 - - layout (local_size_x = 1) in; - - // The input tensors bind index is relative to index in parameter passed - layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; }; - layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; }; - layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; }; - layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; }; - - // Kompute supports push constants updated on dispatch - layout(push_constant) uniform PushConstants { - float val; - } push_const; - - // Kompute also supports spec constants on initalization - layout(constant_id = 0) const float const_one = 0; - - void main() { - uint index = gl_GlobalInvocationID.x; - out_a[index] += uint( in_a[index] * in_b[index] ); - out_b[index] += uint( const_one * push_const.val ); - } - """ - - kompute(shader) - -``` - -### Interactive Notebooks & Hands on Videos - -You are able to try out the interactive Colab Notebooks which allow you to use a free GPU. The available examples are the Python and C++ examples below: - - - - - - - - - - - - - - - - -
-* Try the interactive C++ Colab from Blog Post
-* Try the interactive Python Colab from Blog Post
-
- - -You can also check out the two following talks presented at the FOSDEM 2021 conference. - -Both videos have timestamps which will allow you to skip to the most relevant section for you - the intro & motivations for both is almost the same so you can skip to the more specific content. - - - - - - - - - - - - - - - - -
-* Watch the video for C++ Enthusiasts
-* Watch the video for Python & Machine Learning Enthusiasts
-
- - -## Architectural Overview - -The core architecture of Kompute includes the following: -* [Kompute Manager](https://kompute.cc/overview/reference.html#manager) - Base orchestrator which creates and manages device and child components -* [Kompute Sequence](https://kompute.cc/overview/reference.html#sequence) - Container of operations that can be sent to GPU as batch -* [Kompute Operation (Base)](https://kompute.cc/overview/reference.html#algorithm) - Base class from which all operations inherit -* [Kompute Tensor](https://kompute.cc/overview/reference.html#tensor) - Tensor structured data used in GPU operations -* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) logic executed in the GPU - -To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html). - - - - - - - - -
-Diagrams: Full Architecture and Simplified Kompute Components
-(very tiny, check the full reference diagram in docs for details)
-
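The components listed above map onto a short call sequence. The sketch below is a minimal, illustrative condensation of the Getting Started example, assuming the same C++ API (`kp::Manager`, tensors, an algorithm built from precompiled SPIR-V, and a sequence of recorded operations) and a `spirv` vector produced elsewhere (for example by the `compileSource` helper shown earlier); it is not a complete program.

```c++
#include <kompute/Kompute.hpp>

#include <memory>
#include <vector>

// Minimal sketch of how the core components relate. Assumes `spirv` holds a
// compiled compute shader.
void architectureSketch(const std::vector<uint32_t>& spirv) {
    kp::Manager mgr;                                 // Manager: creates and manages device + child components

    auto tensorIn  = mgr.tensor({ 1.0f, 2.0f, 3.0f });  // Tensors: structured data used in GPU operations
    auto tensorOut = mgr.tensor({ 0.0f, 0.0f, 0.0f });
    std::vector<std::shared_ptr<kp::Tensor>> params = { tensorIn, tensorOut };

    auto algo = mgr.algorithm(params, spirv);        // Algorithm: shader logic executed on the GPU

    mgr.sequence()                                   // Sequence: batch of operations sent to the GPU
        ->record<kp::OpTensorSyncDevice>(params)     // Operations: sync to device, dispatch, sync back
        ->record<kp::OpAlgoDispatch>(algo)
        ->record<kp::OpTensorSyncLocal>(params)
        ->eval();
}
```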
- - -## Asynchronous and Parallel Operations - -Kompute provides flexibility to run operations in an asynrchonous way through vk::Fences. Furthermore, Kompute enables for explicit allocation of queues, which allow for parallel execution of operations across queue families. - -The image below provides an intuition on how Kompute Sequences can be allocated to different queues to enable parallel execution based on hardware. You can see the [hands on example](https://kompute.cc/overview/advanced-examples.html#parallel-operations), as well as the [detailed documentation page](https://kompute.cc/overview/async-parallel.html) describing how it would work using an NVIDIA 1650 as an example. - -![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/queue-allocation.jpg) - -## Mobile Enabled - -Kompute has been optimized to work in mobile environments. The [build system](#build-overview) enables for dynamic loading of the Vulkan shared library for Android environments, together with a working [Android NDK wrapper](https://github.com/KomputeProject/kompute/tree/master/vk_ndk_wrapper_include) for the CPP headers. - - - - - - - - - - -
-For a full deep dive you can read the blog post "Supercharging your Mobile Apps with On-Device GPU Accelerated Machine Learning".
-
-You can also access the end-to-end example code in the repository, which can be run using Android Studio.
-
- -## More examples - -### Simple examples - -* [Simple multiplication example](https://kompute.cc/overview/advanced-examples.html#simple-shader-example) -* [Record batch commands with a Kompute Sequence](https://kompute.cc/overview/advanced-examples.html#record-batch-commands) -* [Run Asynchronous Operations](https://kompute.cc/overview/advanced-examples.html#asynchronous-operations) -* [Run Parallel Operations Across Multiple GPU Queues](https://kompute.cc/overview/advanced-examples.html#parallel-operations) -* [Create your custom Kompute Operations](https://kompute.cc/overview/advanced-examples.html#your-custom-kompute-operation) -* [Implementing logistic regression from scratch](https://kompute.cc/overview/advanced-examples.html#logistic-regression-example) - -### End-to-end examples - -* [Machine Learning Logistic Regression Implementation](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) -* [Parallelizing GPU-intensive Workloads via Multi-Queue Operations](https://towardsdatascience.com/parallelizing-heavy-gpu-workloads-via-multi-queue-operations-50a38b15a1dc) -* [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) -* [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0) - -## Python Package - -Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality, and supports interoperability with Python objects like Lists, Numpy Arrays, etc. - -The only dependencies are Python 3.5+ and Cmake 3.4.1+. You can install Kompute from the [Python pypi package](https://pypi.org/project/kp/) using the following command. - -``` -pip install kp -``` - -You can also install from master branch using: - -``` -pip install git+git://github.com/KomputeProject/kompute.git@master -``` - -For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html). - -## C++ Build Overview - -The build system provided uses `cmake`, which allows for cross platform builds. - -The top level `Makefile` provides a set of optimized configurations for development as well as the docker image build, but you can start a build with the following command: - -``` - cmake -Bbuild -``` - -You also are able to add Kompute in your repo with `add_subdirectory` - the [Android example CMakeLists.txt file](https://github.com/KomputeProject/kompute/blob/7c8c0eeba2cdc098349fcd999102bb2cca1bf711/examples/android/android-simple/app/src/main/cpp/CMakeLists.txt#L3) shows how this would be done. - -For a more advanced overview of the build configuration check out the [Build System Deep Dive](https://kompute.cc/overview/build-system.html) documentation. - -## Kompute Development - -We appreciate PRs and Issues. If you want to contribute try checking the "Good first issue" tag, but even using Kompute and reporting issues is a great contribution! 
- -### Contributing - -#### Dev Dependencies - -* Testing - + GTest -* Documentation - + Doxygen (with Dot) - + Sphynx - -#### Development - -* Follows Mozilla C++ Style Guide https://www-archive.mozilla.org/hacking/mozilla-style-guide.html - + Uses post-commit hook to run the linter, you can set it up so it runs the linter before commit - + All dependencies are defined in vcpkg.json -* Uses cmake as build system, and provides a top level makefile with recommended command -* Uses xxd (or xxd.exe windows 64bit port) to convert shader spirv to header files -* Uses doxygen and sphinx for documentation and autodocs -* Uses vcpkg for finding the dependencies, it's the recommended set up to retrieve the libraries - -If you want to run with debug layers you can add them with the `KOMPUTE_ENV_DEBUG_LAYERS` parameter as: - -``` -export KOMPUTE_ENV_DEBUG_LAYERS="VK_LAYER_LUNARG_api_dump" -``` - -##### Updating documentation - -To update the documentation you will need to: -* Run the gendoxygen target in the build system -* Run the gensphynx target in the build-system -* Push to github pages with `make push_docs_to_ghpages` - -##### Running tests - -Running the unit tests has been significantly simplified for contributors. - -The tests run on CPU, and can be triggered using the ACT command line interface (https://github.com/nektos/act) - once you install the command line (And start the Docker daemon) you just have to type: - -``` -$ act - -[Python Tests/python-tests] 🚀 Start image=axsauze/kompute-builder:0.2 -[C++ Tests/cpp-tests ] 🚀 Start image=axsauze/kompute-builder:0.2 -[C++ Tests/cpp-tests ] 🐳 docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[] -[Python Tests/python-tests] 🐳 docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[] -... -``` - -The repository contains unit tests for the C++ and Python code, and can be found under the `test/` and `python/test` folder. - -The tests are currently run through the CI using Github Actions. It uses the images found in `docker-builders/`. - -In order to minimise hardware requirements the tests can run without a GPU, directly in the CPU using [Swiftshader](https://github.com/google/swiftshader). - -For more information on how the CI and tests are setup, you can go to the [CI, Docker and Tests Section](https://kompute.cc/overview/ci-tests.html) in the documentation. - -## Motivations - -This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support. - -The Vulkan SDK offers a great low level interface that enables for highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of the Vulkan SDK. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc. - -We are currently developing Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on the Vulkan SDK's GPU computing capabilities. 
[This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Kompute architecture. diff --git a/kompute/cmake/bin2h.cmake b/kompute/cmake/bin2h.cmake deleted file mode 100644 index 21ad56cb1..000000000 --- a/kompute/cmake/bin2h.cmake +++ /dev/null @@ -1,106 +0,0 @@ -################################################################################## -# Based on: https://github.com/sivachandran/cmake-bin2h -# -# Copyright 2020 Sivachandran Paramasivam -# -# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -################################################################################## - -include(CMakeParseArguments) - -# Function to wrap a given string into multiple lines at the given column position. -# Parameters: -# VARIABLE - The name of the CMake variable holding the string. -# AT_COLUMN - The column position at which string will be wrapped. -function(WRAP_STRING) - set(oneValueArgs VARIABLE AT_COLUMN) - cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN}) - - string(LENGTH ${${WRAP_STRING_VARIABLE}} stringLength) - math(EXPR offset "0") - - while(stringLength GREATER 0) - - if(stringLength GREATER ${WRAP_STRING_AT_COLUMN}) - math(EXPR length "${WRAP_STRING_AT_COLUMN}") - else() - math(EXPR length "${stringLength}") - endif() - - string(SUBSTRING ${${WRAP_STRING_VARIABLE}} ${offset} ${length} line) - set(lines "${lines}\n${line}") - - math(EXPR stringLength "${stringLength} - ${length}") - math(EXPR offset "${offset} + ${length}") - endwhile() - - set(${WRAP_STRING_VARIABLE} "${lines}" PARENT_SCOPE) -endfunction() - -# Function to embed contents of a file as byte array in C/C++ header file(.h). The header file -# will contain a byte array and integer variable holding the size of the array. -# Parameters -# SOURCE_FILE - The path of source file whose contents will be embedded in the header file. -# VARIABLE_NAME - The name of the variable for the byte array. The string "_SIZE" will be append -# to this name and will be used a variable name for size variable. -# HEADER_FILE - The path of header file. -# APPEND - If specified appends to the header file instead of overwriting it -# NULL_TERMINATE - If specified a null byte(zero) will be append to the byte array. This will be -# useful if the source file is a text file and we want to use the file contents -# as string. 
But the size variable holds size of the byte array without this -# null byte. -# HEADER_NAMESPACE - The namespace, where the array should be located in. -# IS_BIG_ENDIAN - If set to true, will not revers the byte order for the uint32_t to match the -# big endian system architecture -# Usage: -# bin2h(SOURCE_FILE "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG") -function(BIN2H) - set(options APPEND NULL_TERMINATE) - set(oneValueArgs SOURCE_FILE VARIABLE_NAME HEADER_FILE) - cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}" "" ${ARGN}) - - # reads source file contents as hex string - file(READ ${BIN2H_SOURCE_FILE} hexString HEX) - string(LENGTH ${hexString} hexStringLength) - - # appends null byte if asked - if(BIN2H_NULL_TERMINATE) - set(hexString "${hexString}00") - endif() - - # wraps the hex string into multiple lines at column 32(i.e. 16 bytes per line) - wrap_string(VARIABLE hexString AT_COLUMN 32) - math(EXPR arraySize "${hexStringLength} / 8") - - # adds '0x' prefix and comma suffix before and after every byte respectively - if(IS_BIG_ENDIAN) - message(STATUS "Interpreting shader in big endian...") - string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\1\\2\\3\\4, " arrayValues ${hexString}) - else() - message(STATUS "Interpreting shader in little endian...") - string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\4\\3\\2\\1, " arrayValues ${hexString}) - endif() - # removes trailing comma - string(REGEX REPLACE ", $" "" arrayValues ${arrayValues}) - - # converts the variable name into proper C identifier - string(MAKE_C_IDENTIFIER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME) - string(TOUPPER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME) - - # declares byte array and the length variables - set(namespaceStart "namespace ${HEADER_NAMESPACE} {") - set(namespaceEnd "} // namespace ${HEADER_NAMESPACE}") - set(arrayIncludes "#pragma once\n#include \n#include ") - set(arrayDefinition "const std::array ${BIN2H_VARIABLE_NAME} = { ${arrayValues} };") - - set(declarations "${arrayIncludes}\n\n${namespaceStart}\n${arrayDefinition}\n${namespaceEnd}\n\n") - if(BIN2H_APPEND) - file(APPEND ${BIN2H_HEADER_FILE} "${declarations}") - else() - file(WRITE ${BIN2H_HEADER_FILE} "${declarations}") - endif() -endfunction() \ No newline at end of file diff --git a/kompute/cmake/bin_file_to_header.cmake b/kompute/cmake/bin_file_to_header.cmake deleted file mode 100644 index b47b36139..000000000 --- a/kompute/cmake/bin_file_to_header.cmake +++ /dev/null @@ -1,19 +0,0 @@ -cmake_minimum_required(VERSION 3.20) - -if(${INPUT_SHADER_FILE} STREQUAL "") - message(FATAL_ERROR "No input file path provided via 'INPUT_SHADER_FILE'.") -endif() - -if(${OUTPUT_HEADER_FILE} STREQUAL "") - message(FATAL_ERROR "No output file path provided via 'OUTPUT_HEADER_FILE'.") -endif() - -if(${HEADER_NAMESPACE} STREQUAL "") - message(FATAL_ERROR "No header namespace provided via 'HEADER_NAMESPACE'.") -endif() - -include(bin2h.cmake) - -get_filename_component(BINARY_FILE_CONTENT ${INPUT_SHADER_FILE} NAME) -bin2h(SOURCE_FILE ${INPUT_SHADER_FILE} HEADER_FILE ${OUTPUT_HEADER_FILE} VARIABLE_NAME ${BINARY_FILE_CONTENT} HEADER_NAMESPACE ${HEADER_NAMESPACE}) -file(APPEND ${OUTPUT_HEADER_FILE} "\n") \ No newline at end of file diff --git a/kompute/cmake/check_vulkan_version.cmake b/kompute/cmake/check_vulkan_version.cmake deleted file mode 100644 index 0372d3206..000000000 --- a/kompute/cmake/check_vulkan_version.cmake +++ /dev/null @@ 
-1,139 +0,0 @@ -# Current issue: Only checks the result of GPU0 -function(check_vulkan_version) - cmake_parse_arguments(VULKAN_CHECK_VERSION "" "INCLUDE_DIR" "" ${ARGN}) - message(STATUS "Ensuring the currently installed driver supports the Vulkan version requested by the Vulkan Header.") - - # Get the current Vulkan Header version (e.g. 1.2.189). - # This snippet is based on: https://gitlab.kitware.com/cmake/cmake/-/blob/v3.23.1/Modules/FindVulkan.cmake#L140-156 - if(VULKAN_CHECK_VERSION_INCLUDE_DIR) - set(VULKAN_CORE_H ${VULKAN_CHECK_VERSION_INCLUDE_DIR}/vulkan/vulkan_core.h) - if(EXISTS ${VULKAN_CORE_H}) - file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE REGEX "^#define VK_HEADER_VERSION ") - string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION "${VULKAN_HEADER_VERSION_LINE}") - file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE2 REGEX "^#define VK_HEADER_VERSION_COMPLETE ") - if(NOT ${VULKAN_HEADER_VERSION_LINE2} STREQUAL "") - string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION2 "${VULKAN_HEADER_VERSION_LINE2}") - list(LENGTH VULKAN_HEADER_VERSION2 _len) - # Versions >= 1.2.175 have an additional numbers in front of e.g. '0, 1, 2' instead of '1, 2' - if(_len EQUAL 3) - list(REMOVE_AT VULKAN_HEADER_VERSION2 0) - endif() - list(APPEND VULKAN_HEADER_VERSION2 ${VULKAN_HEADER_VERSION}) - list(JOIN VULKAN_HEADER_VERSION2 "." VULKAN_HEADER_VERSION) - else() - file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_2 REGEX "^#define VK_API_VERSION_1_2.*") - if(NOT ${VULKAN_HEADER_API_VERSION_1_2} STREQUAL "") - set(VULKAN_HEADER_VERSION "1.2.${VULKAN_HEADER_VERSION}") - else() - file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_1 REGEX "^#define VK_API_VERSION_1_1.*") - if(NOT ${VULKAN_HEADER_API_VERSION_1_1} STREQUAL "") - set(VULKAN_HEADER_VERSION "1.1.${VULKAN_HEADER_VERSION}") - else() - message(FATAL_ERROR "'${VULKAN_CORE_H}' does not contain a supported Vulkan version. Probably because its < 1.2.0.") - endif() - endif() - endif() - else() - message(FATAL_ERROR "'${VULKAN_CORE_H}' does not exist. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!") - return() - endif() - else() - message(FATAL_ERROR "Invalid Vulkan include directory given. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!") - return() - endif() - message(STATUS "Found Vulkan Header version: ${VULKAN_HEADER_VERSION}") - - # Get Vulkan version supported by driver - find_program(VULKAN_INFO_PATH NAMES vulkaninfo) - if(VULKAN_INFO_PATH STREQUAL "VULKAN_INFO_PATH-NOTFOUND") - message(FATAL_ERROR "vulkaninfo not found. The Vulkan SDK might not be installed properly. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).") - return() - endif() - - execute_process(COMMAND "vulkaninfo" - OUTPUT_VARIABLE VULKAN_INFO_OUTPUT - RESULT_VARIABLE VULKAN_INFO_RETURN) - if(NOT ${VULKAN_INFO_RETURN} EQUAL 0) - message(FATAL_ERROR "Running vulkaninfo failed with return code ${VULKAN_INFO_RETURN}. Make sure you have 'vulkan-tools' installed. Result:\n${VULKAN_INFO_OUTPUT}?") - return() - else() - message(STATUS "Running vulkaninfo was successful. 
Parsing the output...") - endif() - - # Check if running vulkaninfo was successfully - string(FIND "${VULKAN_INFO_OUTPUT}" "Vulkan Instance Version" VULKAN_INFO_SUCCESSFUL) - if(VULKAN_INFO_SUCCESSFUL LESS 0) - message(FATAL_ERROR "Running vulkaninfo failed. Make sure you have 'vulkan-tools' installed and DISPLAY is configured. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON). Result:\n${VULKAN_INFO_OUTPUT}?") - endif() - - string(REGEX MATCHALL "(GPU[0-9]+)" GPU_IDS "${VULKAN_INFO_OUTPUT}") - if(NOT GPU_IDS) - message(FATAL_ERROR "No GPU supporting Vulkan found in vulkaninfo. Does your GPU (driver) support Vulkan?") - endif() - - string(REGEX MATCHALL "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*" GPU_API_VERSIONS ${VULKAN_INFO_OUTPUT}) - if(NOT GPU_API_VERSIONS) - message(FATAL_ERROR "No valid Vulkan API version found in vulkaninfo. Does your GPU (driver) support Vulkan?") - endif() - - # Check length - # message(FATAL_ERROR "GPUS: ${GPU_IDS}") - list(LENGTH GPU_IDS GPU_IDS_LENGTH) - list(LENGTH GPU_API_VERSIONS GPU_API_VERSIONS_LENGTH) - if(NOT ${GPU_IDS_LENGTH} EQUAL ${GPU_API_VERSIONS_LENGTH}) - message(FATAL_ERROR "Found ${GPU_IDS_LENGTH} GPUs, but ${GPU_API_VERSIONS_LENGTH} API versions in vulkaninfo. We expected to find an equal amount of them.") - endif() - - # Compare versions - set(VALID_GPU "") - set(VALID_VULKAN_VERSION "") - math(EXPR ITER_LEN "${GPU_IDS_LENGTH} - 1") - foreach(INDEX RANGE ${ITER_LEN}) - list(GET GPU_IDS ${INDEX} GPU) - list(GET GPU_API_VERSIONS ${INDEX} API_VERSION) - - # Extract API version - if(${API_VERSION} MATCHES "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*") - set(VULKAN_DRIVER_VERSION ${CMAKE_MATCH_1}) - else() - message(FATAL_ERROR "API version match failed. This should not have happened...") - endif() - - message(STATUS "${GPU} supports Vulkan API version '${VULKAN_DRIVER_VERSION}'.") - - # Compare driver and header version - if(${VULKAN_DRIVER_VERSION} VERSION_LESS ${VULKAN_HEADER_VERSION}) - # Version missmatch. Let us check if the minor version is the same. - if(${VULKAN_DRIVER_VERSION} MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+") - set(VULKAN_DRIVER_MINOR_VERSION ${CMAKE_MATCH_1}) - else() - message(FATAL_ERROR "Invalid Vulkan driver version '${VULKAN_DRIVER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'") - endif() - if(${VULKAN_HEADER_VERSION} MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+") - set(VULKAN_HEADER_MINOR_VERSION ${CMAKE_MATCH_1}) - else() - message(FATAL_ERROR "Invalid Vulkan Header version '${VULKAN_HEADER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'") - endif() - - if(${VULKAN_DRIVER_MINOR_VERSION} EQUAL ${VULKAN_HEADER_MINOR_VERSION}) - message(WARNING "Your GPU driver does not support Vulkan > ${VULKAN_DRIVER_VERSION}, but you try to use Vulkan Header ${VULKAN_HEADER_VERSION}. 
At least your driver supports the same minor version (${VULKAN_DRIVER_MINOR_VERSION}), so this should be fine but keep it in mind in case you encounter any strange behavior.") - set(VALID_GPU ${GPU}) - set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION}) - break() - else() - message(STATUS "${GPU} does not support Vulkan > ${VULKAN_DRIVER_VERSION}.") - endif() - else() - set(VALID_GPU ${GPU}) - set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION}) - break() - endif() - endforeach() - - if("${VALID_GPU}" STREQUAL "") - message(FATAL_ERROR "None of your GPUs supports Vulkan Header ${VULKAN_HEADER_VERSION}. Please try updating your driver, or downgrade your Vulkan headers. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).") - else() - message("Valid GPU (${VALID_GPU}) for Vulkan header version ${VULKAN_HEADER_VERSION} found. ${VALID_GPU} supports up to Vulkan ${VALID_VULKAN_VERSION}.") - endif() - -endfunction() diff --git a/kompute/cmake/code_coverage.cmake b/kompute/cmake/code_coverage.cmake deleted file mode 100644 index 7fb6ce264..000000000 --- a/kompute/cmake/code_coverage.cmake +++ /dev/null @@ -1,35 +0,0 @@ -# Code coverage -set(CMAKE_BUILD_TYPE COVERAGE CACHE INTERNAL "Coverage build enabled") -message(STATUS "Enabling gcov support") - -if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(COVERAGE_FLAG "--coverage") -endif() - -set(CMAKE_CXX_FLAGS_COVERAGE - "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage" - CACHE STRING "Flags used by the C++ compiler during coverage builds." - FORCE) -set(CMAKE_C_FLAGS_COVERAGE - "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage" - CACHE STRING "Flags used by the C compiler during coverage builds." - FORCE) -set(CMAKE_EXE_LINKER_FLAGS_COVERAGE - "" - CACHE STRING "Flags used for linking binaries during coverage builds." - FORCE) -set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE - "" - CACHE STRING "Flags used by the shared libraries linker during coverage builds." - FORCE) - -set(CODECOV_DIR ${CMAKE_CURRENT_BINARY_DIR}/codecov/) -set(CODECOV_DIR_LCOV ${CODECOV_DIR}lcov/) -set(CODECOV_FILENAME_LCOV_INFO lcov.info) -set(CODECOV_FILENAME_LCOV_INFO_FULL lcov_full.info) -set(CODECOV_DIR_HTML ${CODECOV_DIR}html/) - -mark_as_advanced(CMAKE_CXX_FLAGS_COVERAGE - CMAKE_C_FLAGS_COVERAGE - CMAKE_EXE_LINKER_FLAGS_COVERAGE - CMAKE_SHARED_LINKER_FLAGS_COVERAGE) diff --git a/kompute/cmake/deprecation_warnings.cmake b/kompute/cmake/deprecation_warnings.cmake deleted file mode 100644 index 1ed1f4555..000000000 --- a/kompute/cmake/deprecation_warnings.cmake +++ /dev/null @@ -1,15 +0,0 @@ -if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD) - message(FATAL_ERROR "'KOMPUTE_OPT_REPO_SUBMODULE_BUILD' got replaced by 'KOMPUTE_OPT_USE_BUILT_IN_SPDLOG', 'KOMPUTE_OPT_USE_BUILT_IN_FMT', 'KOMPUTE_OPT_USE_BUILT_IN_GOOGLE_TEST', 'KOMPUTE_OPT_USE_BUILT_IN_PYBIND11' and 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER'. Please use them instead.") -endif() - -if(KOMPUTE_OPT_BUILD_AS_SHARED_LIB) - message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_AS_SHARED_LIB' is deprecated and should not be used. Instead use the default 'BUILD_SHARED_LIBS' CMake switch.") -endif() - -if(KOMPUTE_OPT_BUILD_SINGLE_HEADER) - message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_SINGLE_HEADER' is deprecated and should not be used. 
The single header will now always be build and can be included via '#include'.") -endif() - -if(KOMPUTE_OPT_ENABLE_SPDLOG) - message(FATAL_ERROR "'KOMPUTE_OPT_ENABLE_SPDLOG' is deprecated and should not be used. It got replaced by 'KOMPUTE_OPT_LOG_LEVEL'. This option can be set to a variety of log levels (e.g. 'Off', 'Trace', 'Debug', 'Default', ...).") -endif() \ No newline at end of file diff --git a/kompute/cmake/komputeConfig.cmake.in b/kompute/cmake/komputeConfig.cmake.in deleted file mode 100644 index 87e8a99e2..000000000 --- a/kompute/cmake/komputeConfig.cmake.in +++ /dev/null @@ -1,8 +0,0 @@ -include(CMakeFindDependencyMacro) -@PACKAGE_INIT@ - -find_dependency(VULKAN REQUIRED) - -include(${CMAKE_CURRENT_LIST_DIR}/komputeTargets.cmake) - -check_required_components(kompute) \ No newline at end of file diff --git a/kompute/cmake/vulkan_shader_compiler.cmake b/kompute/cmake/vulkan_shader_compiler.cmake deleted file mode 100644 index acc27b57c..000000000 --- a/kompute/cmake/vulkan_shader_compiler.cmake +++ /dev/null @@ -1,43 +0,0 @@ -function(vulkan_compile_shader) - find_program(GLS_LANG_VALIDATOR_PATH NAMES glslangValidator) - if(GLS_LANG_VALIDATOR_PATH STREQUAL "GLS_LANG_VALIDATOR_PATH-NOTFOUND") - message(FATAL_ERROR "glslangValidator not found.") - return() - endif() - - cmake_parse_arguments(SHADER_COMPILE "" "INFILE;OUTFILE;NAMESPACE;RELATIVE_PATH" "" ${ARGN}) - set(SHADER_COMPILE_INFILE_FULL "${CMAKE_CURRENT_SOURCE_DIR}/${SHADER_COMPILE_INFILE}") - set(SHADER_COMPILE_SPV_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_INFILE}.spv") - set(SHADER_COMPILE_HEADER_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_OUTFILE}") - - if(NOT SHADER_COMPILE_RELATIVE_PATH) - set(SHADER_COMPILE_RELATIVE_PATH "${PROJECT_SOURCE_DIR}/cmake") - endif() - - # .comp -> .spv - add_custom_command(OUTPUT "${SHADER_COMPILE_SPV_FILE_FULL}" - COMMAND "${GLS_LANG_VALIDATOR_PATH}" - ARGS "-V" - "${SHADER_COMPILE_INFILE_FULL}" - "-o" - "${SHADER_COMPILE_SPV_FILE_FULL}" - COMMENT "Compile vulkan compute shader from file '${SHADER_COMPILE_INFILE_FULL}' to '${SHADER_COMPILE_SPV_FILE_FULL}'." - MAIN_DEPENDENCY "${SHADER_COMPILE_INFILE_FULL}") - - # Check if big or little endian - include (TestBigEndian) - TEST_BIG_ENDIAN(IS_BIG_ENDIAN) - - # .spv -> .hpp - add_custom_command(OUTPUT "${SHADER_COMPILE_HEADER_FILE_FULL}" - COMMAND ${CMAKE_COMMAND} - ARGS "-DINPUT_SHADER_FILE=${SHADER_COMPILE_SPV_FILE_FULL}" - "-DOUTPUT_HEADER_FILE=${SHADER_COMPILE_HEADER_FILE_FULL}" - "-DHEADER_NAMESPACE=${SHADER_COMPILE_NAMESPACE}" - "-DIS_BIG_ENDIAN=${IS_BIG_ENDIAN}" - "-P" - "${SHADER_COMPILE_RELATIVE_PATH}/bin_file_to_header.cmake" - WORKING_DIRECTORY "${SHADER_COMPILE_RELATIVE_PATH}" - COMMENT "Converting compiled shader '${SHADER_COMPILE_SPV_FILE_FULL}' to header file '${SHADER_COMPILE_HEADER_FILE_FULL}'." 
- MAIN_DEPENDENCY "${SHADER_COMPILE_SPV_FILE_FULL}") -endfunction() diff --git a/kompute/config/FindSphinx.cmake b/kompute/config/FindSphinx.cmake deleted file mode 100644 index c645ccc9f..000000000 --- a/kompute/config/FindSphinx.cmake +++ /dev/null @@ -1,16 +0,0 @@ -# Look for an executable called sphinx-build -find_program(SPHINX_EXECUTABLE - NAMES sphinx-build - DOC "Path to sphinx-build executable") - -if(SPHINX_EXECUTABLE STREQUAL "SPHINX_EXECUTABLE-NOTFOUND") - message(FATAL_ERROR "sphinx-build not found.") -endif() - -include(FindPackageHandleStandardArgs) - -# Handle standard arguments to find_package like REQUIRED and QUIET -find_package_handle_standard_args( - Sphinx - "Failed to find sphinx-build executable" - SPHINX_EXECUTABLE) diff --git a/kompute/external/bin/xxd.c b/kompute/external/bin/xxd.c deleted file mode 100644 index 60ed3f712..000000000 --- a/kompute/external/bin/xxd.c +++ /dev/null @@ -1,819 +0,0 @@ -/* -As indicated at https://lists.debian.org/debian-legal/2015/01/msg00037.html, -the author has permitted redistribution of xxd under the MIT license, as follows: - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be included -in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * xxd: my hexdump facility. jw - * - * 2.10.90 changed to word output - * 3.03.93 new indent style, dumb bug inserted and fixed. - * -c option, mls - * 26.04.94 better option parser, -ps, -l, -s added. - * 1.07.94 -r badly needs - as input file. Per default autoskip over - * consecutive lines of zeroes, as unix od does. - * -a shows them too. - * -i dump as c-style #include "file.h" - * 1.11.95 if "xxd -i" knows the filename, an 'unsigned char filename_bits[]' - * array is written in correct c-syntax. - * -s improved, now defaults to absolute seek, relative requires a '+'. - * -r improved, now -r -s -0x... is supported. - * change/suppress leading '\0' bytes. - * -l n improved: stops exactly after n bytes. - * -r improved, better handling of partial lines with trailing garbage. - * -r improved, now -r -p works again! - * -r improved, less flushing, much faster now! (that was silly) - * 3.04.96 Per repeated request of a single person: autoskip defaults to off. - * 15.05.96 -v added. They want to know the version. - * -a fixed, to show last line inf file ends in all zeros. - * -u added: Print upper case hex-letters, as preferred by unix bc. - * -h added to usage message. Usage message extended. - * Now using outfile if specified even in normal mode, aehem. - * No longer mixing of ints and longs. May help doze people. - * Added binify ioctl for same reason. 
(Enough Doze stress for 1996!) - * 16.05.96 -p improved, removed occasional superfluous linefeed. - * 20.05.96 -l 0 fixed. tried to read anyway. - * 21.05.96 -i fixed. now honours -u, and prepends __ to numeric filenames. - * compile -DWIN32 for NT or W95. George V. Reilly, * -v improved :-) - * support --gnuish-longhorn-options - * 25.05.96 MAC support added: CodeWarrior already uses ``outline'' in Types.h - * which is included by MacHeaders (Axel Kielhorn). Renamed to - * xxdline(). - * 7.06.96 -i printed 'int' instead of 'char'. *blush* - * added Bram's OS2 ifdefs... - * 18.07.96 gcc -Wall @ SunOS4 is now slient. - * Added osver for MSDOS/DJGPP/WIN32. - * 29.08.96 Added size_t to strncmp() for Amiga. - * 24.03.97 Windows NT support (Phil Hanna). Clean exit for Amiga WB (Bram) - * 02.04.97 Added -E option, to have EBCDIC translation instead of ASCII - * (azc10@yahoo.com) - * 22.05.97 added -g (group octets) option (jcook@namerica.kla.com). - * 23.09.98 nasty -p -r misfeature fixed: slightly wrong output, when -c was - * missing or wrong. - * 26.09.98 Fixed: 'xxd -i infile outfile' did not truncate outfile. - * 27.10.98 Fixed: -g option parser required blank. - * option -b added: 01000101 binary output in normal format. - * 16.05.00 Added VAXC changes by Stephen P. Wall - * 16.05.00 Improved MMS file and merge for VMS by Zoltan Arpadffy - * - * (c) 1990-1998 by Juergen Weigert (jnweiger@informatik.uni-erlangen.de) - * - * Small changes made afterwards by Bram Moolenaar et al. - * - * Distribute freely and credit me, - * make money and share with me, - * lose money and don't ask me. - * - * - */ - -/* Visual Studio 2005 has 'deprecated' many of the standard CRT functions */ -#if _MSC_VER >= 1400 -# define _CRT_SECURE_NO_DEPRECATE -# define _CRT_NONSTDC_NO_DEPRECATE -#endif - -#include -#ifdef VAXC -# include -#else -# include -#endif -#ifdef __TSC__ -# define MSDOS -#endif -#if !defined(OS2) && defined(__EMX__) -# define OS2 -#endif -#if defined(MSDOS) || defined(WIN32) || defined(OS2) || defined(__BORLANDC__) || defined(CYGWIN) -# include /* for setmode() */ -#else -# ifdef UNIX -# include -# endif -#endif -#include -#include /* for strncmp() */ -#include /* for isalnum() */ -#if __MWERKS__ && !defined(BEBOX) -# include /* for fdopen() on MAC */ -#endif - -#if defined(__BORLANDC__) && __BORLANDC__ <= 0x0410 && !defined(fileno) -/* Missing define and prototype grabbed from the BC 4.0 */ -# define fileno(f) ((f)->fd) -FILE _FAR *_Cdecl _FARFUNC fdopen(int __handle, char _FAR *__type); -#endif - - -/* This corrects the problem of missing prototypes for certain functions - * in some GNU installations (e.g. SunOS 4.1.x). - * Darren Hiebert (sparc-sun-sunos4.1.3_U1/2.7.2.2) - */ -#if defined(__GNUC__) && defined(__STDC__) -# ifndef __USE_FIXED_PROTOTYPES__ -# define __USE_FIXED_PROTOTYPES__ -# endif -#endif - -#ifndef __USE_FIXED_PROTOTYPES__ -/* - * This is historic and works only if the compiler really has no prototypes: - * - * Include prototypes for Sun OS 4.x, when using an ANSI compiler. - * FILE is defined on OS 4.x, not on 5.x (Solaris). - * if __SVR4 is defined (some Solaris versions), don't include this. 
- */ -#if defined(sun) && defined(FILE) && !defined(__SVR4) && defined(__STDC__) -# define __P(a) a -/* excerpt from my sun_stdlib.h */ -extern int fprintf __P((FILE *, char *, ...)); -extern int fputs __P((char *, FILE *)); -extern int _flsbuf __P((unsigned char, FILE *)); -extern int _filbuf __P((FILE *)); -extern int fflush __P((FILE *)); -extern int fclose __P((FILE *)); -extern int fseek __P((FILE *, long, int)); -extern int rewind __P((FILE *)); - -extern void perror __P((char *)); -# endif -#endif - -extern long int strtol(); -extern long int ftell(); - -char version[] = "xxd V1.10 27oct98 by Juergen Weigert"; -#ifdef WIN32 -char osver[] = " (Win32)"; -#else -# ifdef DJGPP -char osver[] = " (dos 32 bit)"; -# else -# ifdef MSDOS -char osver[] = " (dos 16 bit)"; -# else -char osver[] = ""; -# endif -# endif -#endif - -#if !defined(CYGWIN) && (defined(CYGWIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__)) -# define CYGWIN -#endif -#if defined(MSDOS) || defined(WIN32) || defined(OS2) -# define BIN_READ(yes) ((yes) ? "rb" : "rt") -# define BIN_WRITE(yes) ((yes) ? "wb" : "wt") -# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT) -# define BIN_ASSIGN(fp, yes) setmode(fileno(fp), (yes) ? O_BINARY : O_TEXT) -# define PATH_SEP '\\' -#elif defined(CYGWIN) -# define BIN_READ(yes) ((yes) ? "rb" : "rt") -# define BIN_WRITE(yes) ((yes) ? "wb" : "w") -# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT) -# define BIN_ASSIGN(fp, yes) ((yes) ? (void) setmode(fileno(fp), O_BINARY) : (void) (fp)) -# define PATH_SEP '/' -#else -# ifdef VMS -# define BIN_READ(dummy) "r" -# define BIN_WRITE(dummy) "w" -# define BIN_CREAT(dummy) O_CREAT -# define BIN_ASSIGN(fp, dummy) fp -# define PATH_SEP ']' -# define FILE_SEP '.' -# else -# define BIN_READ(dummy) "r" -# define BIN_WRITE(dummy) "w" -# define BIN_CREAT(dummy) O_CREAT -# define BIN_ASSIGN(fp, dummy) fp -# define PATH_SEP '/' -# endif -#endif - -/* open has only to arguments on the Mac */ -#if __MWERKS__ -# define OPEN(name, mode, umask) open(name, mode) -#else -# define OPEN(name, mode, umask) open(name, mode, umask) -#endif - -#ifdef AMIGA -# define STRNCMP(s1, s2, l) strncmp(s1, s2, (size_t)l) -#else -# define STRNCMP(s1, s2, l) strncmp(s1, s2, l) -#endif - -#ifndef __P -# if defined(__STDC__) || defined(MSDOS) || defined(WIN32) || defined(OS2) \ - || defined(__BORLANDC__) -# define __P(a) a -# else -# define __P(a) () -# endif -#endif - -/* Let's collect some prototypes */ -/* CodeWarrior is really picky about missing prototypes */ -static void exit_with_usage __P((char *)); -static int huntype __P((FILE *, FILE *, FILE *, char *, int, int, long)); -static void xxdline __P((FILE *, char *, int)); - -#define TRY_SEEK /* attempt to use lseek, or skip forward by reading */ -#define COLS 256 /* change here, if you ever need more columns */ -#define LLEN (11 + (9*COLS-1)/1 + COLS + 2) - -char hexxa[] = "0123456789abcdef0123456789ABCDEF", *hexx = hexxa; - -/* the different hextypes known by this program: */ -#define HEX_NORMAL 0 -#define HEX_POSTSCRIPT 1 -#define HEX_CINCLUDE 2 -#define HEX_BITS 3 /* not hex a dump, but bits: 01111001 */ - -static void -exit_with_usage(pname) -char *pname; -{ - fprintf(stderr, "Usage:\n %s [options] [infile [outfile]]\n", pname); - fprintf(stderr, " or\n %s -r [-s [-]offset] [-c cols] [-ps] [infile [outfile]]\n", pname); - fprintf(stderr, "Options:\n"); - fprintf(stderr, " -a toggle autoskip: A single '*' replaces nul-lines. 
Default off.\n"); - fprintf(stderr, " -b binary digit dump (incompatible with -p,-i,-r). Default hex.\n"); - fprintf(stderr, " -c cols format octets per line. Default 16 (-i: 12, -ps: 30).\n"); - fprintf(stderr, " -E show characters in EBCDIC. Default ASCII.\n"); - fprintf(stderr, " -g number of octets per group in normal output. Default 2.\n"); - fprintf(stderr, " -h print this summary.\n"); - fprintf(stderr, " -i output in C include file style.\n"); - fprintf(stderr, " -l len stop after octets.\n"); - fprintf(stderr, " -ps output in postscript plain hexdump style.\n"); - fprintf(stderr, " -r reverse operation: convert (or patch) hexdump into binary.\n"); - fprintf(stderr, " -r -s off revert with added to file positions found in hexdump.\n"); - fprintf(stderr, " -s %sseek start at bytes abs. %sinfile offset.\n", -#ifdef TRY_SEEK - "[+][-]", "(or +: rel.) "); -#else - "", ""); -#endif - fprintf(stderr, " -u use upper case hex letters.\n"); - fprintf(stderr, " -v show version: \"%s%s\".\n", version, osver); - exit(1); -} - -/* - * Max. cols binary characters are decoded from the input stream per line. - * Two adjacent garbage characters after evaluated data delimit valid data. - * Everything up to the next newline is discarded. - * - * The name is historic and came from 'undo type opt h'. - */ -static int -huntype(fpi, fpo, fperr, pname, cols, hextype, base_off) -FILE *fpi, *fpo, *fperr; -char *pname; -int cols, hextype; -long base_off; -{ - int c, ign_garb = 1, n1 = -1, n2 = 0, n3, p = cols; - long have_off = 0, want_off = 0; - - rewind(fpi); - - while ((c = getc(fpi)) != EOF) - { - if (c == '\r') /* Doze style input file? */ - continue; - -#if 0 /* this doesn't work when there is normal text after the hex codes in - the last line that looks like hex */ - if (c == ' ' || c == '\n' || c == '\t') /* allow multiple spaces */ - continue; -#endif - - n3 = n2; - n2 = n1; - - if (c >= '0' && c <= '9') - n1 = c - '0'; - else if (c >= 'a' && c <= 'f') - n1 = c - 'a' + 10; - else if (c >= 'A' && c <= 'F') - n1 = c - 'A' + 10; - else - { - n1 = -1; - if (ign_garb) - continue; - } - - ign_garb = 0; - - if (p >= cols) - { - if (!hextype) - { - if (n1 < 0) - { - p = 0; - continue; - } - want_off = (want_off << 4) | n1; - continue; - } - else - p = 0; - } - - if (base_off + want_off != have_off) - { - fflush(fpo); -#ifdef TRY_SEEK - c = fseek(fpo, base_off + want_off - have_off, 1); - if (c >= 0) - have_off = base_off + want_off; -#endif - if (base_off + want_off < have_off) - { - fprintf(fperr, "%s: sorry, cannot seek backwards.\n", pname); - return 5; - } - for (; have_off < base_off + want_off; have_off++) - putc(0, fpo); - } - - if (n2 >= 0 && n1 >= 0) - { - putc((n2 << 4) | n1, fpo); - have_off++; - want_off++; - n1 = -1; - if ((++p >= cols) && !hextype) - { - /* skip rest of line as garbage */ - want_off = 0; - while ((c = getc(fpi)) != '\n' && c != EOF) - ; - ign_garb = 1; - } - } - else if (n1 < 0 && n2 < 0 && n3 < 0) - { - /* already stumbled into garbage, skip line, wait and see */ - if (!hextype) - want_off = 0; - while ((c = getc(fpi)) != '\n' && c != EOF) - ; - ign_garb = 1; - } - } - fflush(fpo); -#ifdef TRY_SEEK - fseek(fpo, 0L, 2); -#endif - fclose(fpo); - fclose(fpi); - return 0; -} - -/* - * Print line l. If nz is false, xxdline regards the line a line of - * zeroes. If there are three or more consecutive lines of zeroes, - * they are replaced by a single '*' character. 
- * - * If the output ends with more than two lines of zeroes, you - * should call xxdline again with l being the last line and nz - * negative. This ensures that the last line is shown even when - * it is all zeroes. - * - * If nz is always positive, lines are never suppressed. - */ -static void -xxdline(fp, l, nz) -FILE *fp; -char *l; -int nz; -{ - static char z[LLEN+1]; - static int zero_seen = 0; - - if (!nz && zero_seen == 1) - strcpy(z, l); - - if (nz || !zero_seen++) - { - if (nz) - { - if (nz < 0) - zero_seen--; - if (zero_seen == 2) - fputs(z, fp); - if (zero_seen > 2) - fputs("*\n", fp); - } - if (nz >= 0 || zero_seen > 0) - fputs(l, fp); - if (nz) - zero_seen = 0; - } -} - -/* This is an EBCDIC to ASCII conversion table */ -/* from a proposed BTL standard April 16, 1979 */ -static unsigned char etoa64[] = -{ - 0040,0240,0241,0242,0243,0244,0245,0246, - 0247,0250,0325,0056,0074,0050,0053,0174, - 0046,0251,0252,0253,0254,0255,0256,0257, - 0260,0261,0041,0044,0052,0051,0073,0176, - 0055,0057,0262,0263,0264,0265,0266,0267, - 0270,0271,0313,0054,0045,0137,0076,0077, - 0272,0273,0274,0275,0276,0277,0300,0301, - 0302,0140,0072,0043,0100,0047,0075,0042, - 0303,0141,0142,0143,0144,0145,0146,0147, - 0150,0151,0304,0305,0306,0307,0310,0311, - 0312,0152,0153,0154,0155,0156,0157,0160, - 0161,0162,0136,0314,0315,0316,0317,0320, - 0321,0345,0163,0164,0165,0166,0167,0170, - 0171,0172,0322,0323,0324,0133,0326,0327, - 0330,0331,0332,0333,0334,0335,0336,0337, - 0340,0341,0342,0343,0344,0135,0346,0347, - 0173,0101,0102,0103,0104,0105,0106,0107, - 0110,0111,0350,0351,0352,0353,0354,0355, - 0175,0112,0113,0114,0115,0116,0117,0120, - 0121,0122,0356,0357,0360,0361,0362,0363, - 0134,0237,0123,0124,0125,0126,0127,0130, - 0131,0132,0364,0365,0366,0367,0370,0371, - 0060,0061,0062,0063,0064,0065,0066,0067, - 0070,0071,0372,0373,0374,0375,0376,0377 -}; - -const char* extract_filename(const char* path) { - const char* filename = strrchr(path, '/'); - if (filename) { - return filename + 1; - } - return path; -} - -int -main(argc, argv) -int argc; -char *argv[]; -{ - FILE *fp, *fpo; - int c, e, p = 0, relseek = 1, negseek = 0, revert = 0; - int cols = 0, nonzero = 0, autoskip = 0, hextype = HEX_NORMAL; - int ebcdic = 0; - int octspergrp = -1; /* number of octets grouped in output */ - int grplen; /* total chars per octet group */ - long length = -1, n = 0, seekoff = 0; - char l[LLEN+1]; - char *pname, *pp; - -#ifdef AMIGA - /* This program doesn't work when started from the Workbench */ - if (argc == 0) - exit(1); -#endif - - pname = argv[0]; - for (pp = pname; *pp; ) - if (*pp++ == PATH_SEP) - pname = pp; -#ifdef FILE_SEP - for (pp = pname; *pp; pp++) - if (*pp == FILE_SEP) - { - *pp = '\0'; - break; - } -#endif - - while (argc >= 2) - { - pp = argv[1] + (!STRNCMP(argv[1], "--", 2) && argv[1][2]); - if (!STRNCMP(pp, "-a", 2)) autoskip = 1 - autoskip; - else if (!STRNCMP(pp, "-b", 2)) hextype = HEX_BITS; - else if (!STRNCMP(pp, "-u", 2)) hexx = hexxa + 16; - else if (!STRNCMP(pp, "-p", 2)) hextype = HEX_POSTSCRIPT; - else if (!STRNCMP(pp, "-i", 2)) hextype = HEX_CINCLUDE; - else if (!STRNCMP(pp, "-r", 2)) revert++; - else if (!STRNCMP(pp, "-E", 2)) ebcdic++; - else if (!STRNCMP(pp, "-v", 2)) - { - fprintf(stderr, "%s%s\n", version, osver); - exit(0); - } - else if (!STRNCMP(pp, "-c", 2)) - { - if (pp[2] && STRNCMP("ols", pp + 2, 3)) - cols = (int)strtol(pp + 2, NULL, 0); - else - { - if (!argv[2]) - exit_with_usage(pname); - cols = (int)strtol(argv[2], NULL, 0); - argv++; - argc--; - } - } - else if 
(!STRNCMP(pp, "-g", 2)) - { - if (pp[2] && STRNCMP("roupsize", pp + 2, 8)) - octspergrp = (int)strtol(pp + 2, NULL, 0); - else - { - if (!argv[2]) - exit_with_usage(pname); - octspergrp = (int)strtol(argv[2], NULL, 0); - argv++; - argc--; - } - } - else if (!STRNCMP(pp, "-s", 2)) - { - relseek = 0; - negseek = 0; - if (pp[2] && STRNCMP("kip", pp+2, 3) && STRNCMP("eek", pp+2, 3)) - { -#ifdef TRY_SEEK - if (pp[2] == '+') - relseek++; - if (pp[2+relseek] == '-') - negseek++; -#endif - seekoff = strtol(pp + 2+relseek+negseek, (char **)NULL, 0); - } - else - { - if (!argv[2]) - exit_with_usage(pname); -#ifdef TRY_SEEK - if (argv[2][0] == '+') - relseek++; - if (argv[2][relseek] == '-') - negseek++; -#endif - seekoff = strtol(argv[2] + relseek+negseek, (char **)NULL, 0); - argv++; - argc--; - } - } - else if (!STRNCMP(pp, "-l", 2)) - { - if (pp[2] && STRNCMP("en", pp + 2, 2)) - length = strtol(pp + 2, (char **)NULL, 0); - else - { - if (!argv[2]) - exit_with_usage(pname); - length = strtol(argv[2], (char **)NULL, 0); - argv++; - argc--; - } - } - else if (!strcmp(pp, "--")) /* end of options */ - { - argv++; - argc--; - break; - } - else if (pp[0] == '-' && pp[1]) /* unknown option */ - exit_with_usage(pname); - else - break; /* not an option */ - - argv++; /* advance to next argument */ - argc--; - } - - if (!cols) - switch (hextype) - { - case HEX_POSTSCRIPT: cols = 30; break; - case HEX_CINCLUDE: cols = 12; break; - case HEX_BITS: cols = 6; break; - case HEX_NORMAL: - default: cols = 16; break; - } - - if (octspergrp < 0) - switch (hextype) - { - case HEX_BITS: octspergrp = 1; break; - case HEX_NORMAL: octspergrp = 2; break; - case HEX_POSTSCRIPT: - case HEX_CINCLUDE: - default: octspergrp = 0; break; - } - - if (cols < 1 || ((hextype == HEX_NORMAL || hextype == HEX_BITS) - && (cols > COLS))) - { - fprintf(stderr, "%s: invalid number of columns (max. %d).\n", pname, COLS); - exit(1); - } - - if (octspergrp < 1) - octspergrp = cols; - - if (argc > 3) - exit_with_usage(pname); - - if (argc == 1 || (argv[1][0] == '-' && !argv[1][1])) - BIN_ASSIGN(fp = stdin, !revert); - else - { - if ((fp = fopen(argv[1], BIN_READ(!revert))) == NULL) - { - fprintf(stderr,"%s: ", pname); - perror(argv[1]); - return 2; - } - } - - if (argc < 3 || (argv[2][0] == '-' && !argv[2][1])) - BIN_ASSIGN(fpo = stdout, revert); - else - { - int fd; - int mode = revert ? O_WRONLY : (O_TRUNC|O_WRONLY); - - if (((fd = OPEN(argv[2], mode | BIN_CREAT(revert), 0666)) < 0) || - (fpo = fdopen(fd, BIN_WRITE(revert))) == NULL) - { - fprintf(stderr, "%s: ", pname); - perror(argv[2]); - return 3; - } - rewind(fpo); - } - - if (revert) - { - if (hextype && (hextype != HEX_POSTSCRIPT)) - { - fprintf(stderr, "%s: sorry, cannot revert this type of hexdump\n", pname); - return -1; - } - return huntype(fp, fpo, stderr, pname, cols, hextype, - negseek ? -seekoff : seekoff); - } - - if (seekoff || negseek || !relseek) - { -#ifdef TRY_SEEK - if (relseek) - e = fseek(fp, negseek ? -seekoff : seekoff, 1); - else - e = fseek(fp, negseek ? -seekoff : seekoff, negseek ? 2 : 0); - if (e < 0 && negseek) - { - fprintf(stderr, "%s: sorry cannot seek.\n", pname); - return 4; - } - if (e >= 0) - seekoff = ftell(fp); - else -#endif - { - long s = seekoff; - - while (s--) - (void)getc(fp); - } - } - - if (hextype == HEX_CINCLUDE) - { - const char* filename = extract_filename(argv[1]); - - if (fp != stdin) - { - fprintf(fpo, "unsigned char %s", isdigit((int)filename[0]) ? "__" : ""); - for (e = 0; (c = filename[e]) != 0; e++) - putc(isalnum(c) ? 
c : '_', fpo); - fputs("[] = {\n", fpo); - } - - p = 0; - while ((length < 0 || p < length) && (c = getc(fp)) != EOF) - { - fprintf(fpo, (hexx == hexxa) ? "%s0x%02x" : "%s0X%02X", - (p % cols) ? ", " : ",\n "+2*!p, c); - p++; - } - - if (p) - fputs("\n};\n"+3*(fp == stdin), fpo); - - if (fp != stdin) - { - fprintf(fpo, "unsigned int %s", isdigit((int)filename[0]) ? "__" : ""); - for (e = 0; (c = filename[e]) != 0; e++) - putc(isalnum(c) ? c : '_', fpo); - fprintf(fpo, "_len = %d;\n", p); - } - - fclose(fp); - fclose(fpo); - return 0; - } - - if (hextype == HEX_POSTSCRIPT) - { - p = cols; - while ((length < 0 || n < length) && (e = getc(fp)) != EOF) - { - putchar(hexx[(e >> 4) & 0xf]); - putchar(hexx[(e ) & 0xf]); - n++; - if (!--p) - { - putchar('\n'); - p = cols; - } - } - if (p < cols) - putchar('\n'); - fclose(fp); - fclose(fpo); - return 0; - } - - /* hextype: HEX_NORMAL or HEX_BITS */ - - if (hextype == HEX_NORMAL) - grplen = octspergrp + octspergrp + 1; /* chars per octet group */ - else /* hextype == HEX_BITS */ - grplen = 8 * octspergrp + 1; - - while ((length < 0 || n < length) && (e = getc(fp)) != EOF) - { - if (p == 0) - { - sprintf(l, "%07lx: ", n + seekoff); - for (c = 9; c < LLEN; l[c++] = ' '); - } - if (hextype == HEX_NORMAL) - { - l[c = (9 + (grplen * p) / octspergrp)] = hexx[(e >> 4) & 0xf]; - l[++c] = hexx[ e & 0xf]; - } - else /* hextype == HEX_BITS */ - { - int i; - - c = (9 + (grplen * p) / octspergrp) - 1; - for (i = 7; i >= 0; i--) - l[++c] = (e & (1 << i)) ? '1' : '0'; - } - if (ebcdic) - e = (e < 64) ? '.' : etoa64[e-64]; - /* When changing this update definition of LLEN above. */ - l[11 + (grplen * cols - 1)/octspergrp + p] = -#ifdef __MVS__ - (e >= 64) -#else - (e > 31 && e < 127) -#endif - ? e : '.'; - if (e) - nonzero++; - n++; - if (++p == cols) - { - l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0'; - xxdline(fpo, l, autoskip ? nonzero : 1); - nonzero = 0; - p = 0; - } - } - if (p) - { - l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0'; - xxdline(fpo, l, 1); - } - else if (autoskip) - xxdline(fpo, l, -1); /* last chance to flush out suppressed lines */ - - fclose(fp); - fclose(fpo); - return 0; -} diff --git a/kompute/kompute-config.cmake b/kompute/kompute-config.cmake deleted file mode 100644 index 10425252c..000000000 --- a/kompute/kompute-config.cmake +++ /dev/null @@ -1,28 +0,0 @@ -# General purpose GPU compute framework built on Vulkan to -# support 1000s of cross vendor graphics cards -# (AMD, Qualcomm, NVIDIA & friends). Blazing fast, mobile-enabled, -# asynchronous and optimized for advanced GPU data processing use cases. -# Backed by the Linux Foundation. -# -# Finding this module will define the following variables: -# KOMPUTE_FOUND - True if the core library has been found -# KOMPUTE_LIBRARIES - Path to the core library archive -# KOMPUTE_INCLUDE_DIRS - Path to the include directories. Gives access -# to kompute.h, as a single include which must be included in every -# file that uses this interface. Else it also points to the -# directory for individual includes. 
- -find_path(KOMPUTE_INCLUDE_DIR - NAMES kompute.h) - -find_library(KOMPUTE_LIBRARY - NAMES kompute - HINTS ${KOMPUTE_LIBRARY_ROOT}) - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(KOMPUTE REQUIRED_VARS KOMPUTE_LIBRARY KOMPUTE_INCLUDE_DIR) - -if(KOMPUTE_FOUND) - set(KOMPUTE_LIBRARIES ${KOMPUTE_LIBRARY}) - set(KOMPUTE_INCLUDE_DIRS ${KOMPUTE_INCLUDE_DIR}) -endif() diff --git a/kompute/scripts/convert_shaders.py b/kompute/scripts/convert_shaders.py deleted file mode 100755 index 11a3ab974..000000000 --- a/kompute/scripts/convert_shaders.py +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env python3 -""" - Script to handle conversion of compute shaders to spirv and to headers -""" -import os -import sys -import logging -import click -import subprocess - -logger = logging.getLogger(__name__) -logger.addHandler(logging.StreamHandler()) - -is_windows = sys.platform.startswith('win') - -CWD=os.path.dirname(os.path.abspath(__file__)) -XXD_LINUX_CMD="xxd" -XXD_WINDOWS_CMD=os.path.abspath(os.path.join(CWD, "..\\external\\bin\\", "xxd.exe")) - -SHADER_GENERATED_NOTICE = """/* - THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT - - --- - - Copyright 2020 The Institute for Ethical AI & Machine Learning - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-*/ -""" - -@click.command() -@click.option( - "--shader-path", - "-p", - envvar="KOMPUTE_SHADER_PATH", - required=True, - help="The path for the directory to build and convert shaders", -) -@click.option( - "--shader-binary", - "-s", - envvar="KOMPUTE_SHADER_BINARY", - required=True, - help="The path for the directory to build and convert shaders", -) -@click.option( - "--header-path", - "-c", - envvar="KOMPUTE_HEADER_PATH", - default="", - required=False, - help="The (optional) output file for the cpp header files", -) -@click.option( - "--verbose", - "-v", - envvar="KOMPUTE_HEADER_PATH", - default=False, - is_flag=True, - help="Enable versbosity if flag is provided", -) -def run_cli( - shader_path: str = None, - shader_binary: str = None, - header_path: bool = None, - verbose: bool = None, -): - """ - CLI function for shader generation - """ - - if verbose: - logger.setLevel(logging.DEBUG) - else: - logger.setLevel(logging.WARNING) - - logger.debug(f"Starting script with variables: {locals()}") - - if is_windows: - logger.debug(f"Running on windows, converting input paths") - shader_path = shader_path.replace("/", "\\") - header_path = header_path.replace("/", "\\") - - shader_files = [] - for root, directory, files in os.walk(shader_path): - for file in files: - if file.endswith(".comp"): - shader_files.append(os.path.join(root, file)) - - run_cmd = lambda *args: subprocess.check_output([*args]).decode() - - logger.debug(f"Output spirv path: {shader_path}") - logger.debug(f"Converting files to spirv: {shader_files}") - - spirv_files = [] - for file in shader_files: - logger.debug(f"Converting to spirv: {file}") - spirv_file = f"{file}.spv" - run_cmd(shader_binary, "-V", file, "-o", spirv_file) - spirv_files.append(spirv_file) - - # Create cpp files if header_path provided - if header_path: - logger.debug(f"Header path provided. 
Converting bin files to hpp.") - logger.debug(f"Output header path: {shader_path}") - - # Check if xxd command options are available - if is_windows: - xxd_cmd = XXD_WINDOWS_CMD - else: - xxd_cmd = XXD_LINUX_CMD - - for file in spirv_files: - print(xxd_cmd) - header_data = str(run_cmd(xxd_cmd, "-i", file)) - # Ensuring the variable is a static const unsigned - header_data = header_data.replace("unsigned", "static const unsigned") - if is_windows: - raw_file_name = file.split("\\")[-1] - else: - raw_file_name = file.split("/")[-1] - file_name = f"shader{raw_file_name}" - header_file = file_name.replace(".comp.spv", ".hpp") - header_file_define = "SHADEROP_" + header_file.replace(".", "_").upper() - logger.debug(f"Converting to hpp: {file_name}") - with open(os.path.join(header_path, header_file), "w+", newline='\n') as fstream: - fstream.write(f"{SHADER_GENERATED_NOTICE}\n") - fstream.write(f"#ifndef {header_file_define}\n") - fstream.write(f"#define {header_file_define}\n\n") - fstream.write("namespace kp {\n") - fstream.write("namespace shader_data {\n") - fstream.write(f"{header_data}") - fstream.write("}\n") - fstream.write("}\n") - fstream.write(f"#endif // define {header_file_define}\n") - - -if __name__ == "__main__": - run_cli() diff --git a/kompute/scripts/requirements.txt b/kompute/scripts/requirements.txt deleted file mode 100644 index 4da042504..000000000 --- a/kompute/scripts/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ -# CLI dependencies -click==7.1.2 - -# Dev dependencies -black==19.10b0 -quom==1.2.0 -Sphinx==3.2.1 -sphinx_material==0.0.30 -breathe==4.20.0 -m2r2==0.2.5 -git+git://github.com/pybind/pybind11_mkdoc.git@master diff --git a/kompute/setup.py b/kompute/setup.py deleted file mode 100644 index 09faa8d1a..000000000 --- a/kompute/setup.py +++ /dev/null @@ -1,93 +0,0 @@ -import os -import re -import platform -import sys -import sysconfig -import subprocess - -from setuptools import setup, Extension -from setuptools.command.build_ext import build_ext -from distutils.version import LooseVersion - -curr_dir = os.path.abspath(os.path.dirname(__file__)) -with open(os.path.join(curr_dir, 'README.md'), encoding='utf-8') as f: - long_description = f.read() - -class CMakeExtension(Extension): - def __init__(self, name, sourcedir=''): - Extension.__init__(self, name, sources=[]) - self.sourcedir = os.path.abspath(sourcedir) - - -class CMakeBuild(build_ext): - def run(self): - try: - out = subprocess.check_output(['cmake', '--version']) - except OSError: - raise RuntimeError("CMake must be installed to build the following extensions: " + - ", ".join(e.name for e in self.extensions)) - - cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1)) - if cmake_version < '3.15': - raise RuntimeError("CMake >= 3.15 is required") - - for ext in self.extensions: - self.build_extension(ext) - - def build_extension(self, ext): - extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) - # required for auto-detection of auxiliary "native" libs - if not extdir.endswith(os.path.sep): - extdir += os.path.sep - - cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, - '-DKOMPUTE_OPT_BUILD_PYTHON=ON', - '-DKOMPUTE_OPT_LOG_LEVEL=Off', - '-DKOMPUTE_OPT_USE_SPDLOG=Off', - '-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' - '-DPYTHON_EXECUTABLE=' + sys.executable, - '-DPYTHON_INCLUDE_DIR=' + sysconfig.get_path('include'), - '-DPYTHON_LIBRARY=' + sysconfig.get_path('stdlib'), - ] - - cfg = 'Debug' if self.debug else 'Release' - build_args = 
['--config', cfg] - - env = os.environ.copy() - oldCxxFlags = env.get('CXXFLAGS', '') - env['CXXFLAGS'] = f'{oldCxxFlags} -DVERSION_INFO=\\"{self.distribution.get_version()}\\"' - - if platform.system() == "Windows": - cmake_args += [f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}'] - if sys.maxsize > 2**32: - cmake_args += ['-A', 'x64'] - build_args += ['--', '/m'] - else: - env['CXXFLAGS'] += ' -fPIC' - cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] - build_args += ['--', '-j'] - # Optional environment variable to limit the number of parallel jobs for GitHub actions to reduce RAM usage - if 'KOMPUTE_PYTHON_NUM_PARALLEL_THREADS' in env: - build_args += env['KOMPUTE_PYTHON_NUM_PARALLEL_THREADS'] - - if not os.path.exists(self.build_temp): - os.makedirs(self.build_temp) - - subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env) - subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) - -setup( - name='kp', - version='0.8.1', - author='Alejandro Saucedo', - description='Kompute: Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU processing usecases.', - long_description=long_description, - long_description_content_type='text/markdown', - ext_modules=[CMakeExtension('kp')], - install_requires=[ - "numpy<2.0.0" - ], - cmdclass=dict(build_ext=CMakeBuild), - zip_safe=False, - include_package_data=True, -) diff --git a/kompute/src/Algorithm.cpp b/kompute/src/Algorithm.cpp deleted file mode 100644 index c2d8554e1..000000000 --- a/kompute/src/Algorithm.cpp +++ /dev/null @@ -1,418 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#include - -#include "kompute/Algorithm.hpp" - -namespace kp { - -Algorithm::~Algorithm() -{ - KP_LOG_DEBUG("Kompute Algorithm Destructor started"); - - this->destroy(); -} - -bool -Algorithm::isInit() -{ - return this->mPipeline && this->mPipelineCache && this->mPipelineLayout && - this->mDescriptorPool && this->mDescriptorSet && - this->mDescriptorSetLayout && this->mShaderModule; -} - -void -Algorithm::destroy() -{ - // We don't have to free memory on destroy as it's freed by the - // commandBuffer destructor if (this->mPushConstantsData) { - // free(this->mPushConstantsData); - // } - // if (this->mSpecializationConstantsData) { - // free(this->mSpecializationConstantsData); - // } - - if (!this->mDevice) { - KP_LOG_WARN("Kompute Algorithm destroy function reached with null " - "Device pointer"); - return; - } - - if (this->mFreePipeline && this->mPipeline) { - KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline"); - if (!this->mPipeline) { - KP_LOG_WARN("Kompute Algorithm Error requested to destroy " - "pipeline but it is null"); - } - this->mDevice->destroy( - *this->mPipeline, - (vk::Optional)nullptr); - this->mPipeline = nullptr; - } - - if (this->mFreePipelineLayout && this->mPipelineLayout) { - KP_LOG_DEBUG("Kompute Algorithm Destroying pipeline layout"); - if (!this->mPipelineLayout) { - KP_LOG_WARN("Kompute Algorithm Error requested to destroy " - "pipeline layout but it is null"); - } - this->mDevice->destroy( - *this->mPipelineLayout, - (vk::Optional)nullptr); - this->mPipelineLayout = nullptr; - } - - if (this->mFreeShaderModule && this->mShaderModule) { - KP_LOG_DEBUG("Kompute Algorithm Destroying shader module"); - if (!this->mShaderModule) { - KP_LOG_WARN("Kompute Algorithm Error requested to destroy shader " - "module but it is null"); - } - this->mDevice->destroy( - *this->mShaderModule, - (vk::Optional)nullptr); - this->mShaderModule = nullptr; - } - - 
freeParameters();
-}
-
-void
-Algorithm::freeParameters()
-{
-    if (this->mFreeDescriptorSetLayout && this->mDescriptorSetLayout) {
-        KP_LOG_DEBUG("Kompute Algorithm Destroying Descriptor Set Layout");
-        if (!this->mDescriptorSetLayout) {
-            KP_LOG_WARN("Kompute Algorithm Error requested to destroy "
-                        "descriptor set layout but it is null");
-        }
-        this->mDevice->destroy(
-          *this->mDescriptorSetLayout,
-          (vk::Optional<const vk::AllocationCallbacks>)nullptr);
-        this->mDescriptorSetLayout = nullptr;
-    }
-}
-
-void
-Algorithm::createParameters()
-{
-    KP_LOG_DEBUG("Kompute Algorithm createParameters started");
-    if (!*this->mDescriptorPool) {
-        KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
-        return;
-    }
-
-    std::vector<vk::DescriptorSetLayoutBinding> descriptorSetBindings;
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        descriptorSetBindings.push_back(
-          vk::DescriptorSetLayoutBinding(i, // Binding index
-                                         vk::DescriptorType::eStorageBuffer,
-                                         1, // Descriptor count
-                                         vk::ShaderStageFlagBits::eCompute));
-    }
-
-    // This is the component that is fed into the pipeline
-    vk::DescriptorSetLayoutCreateInfo descriptorSetLayoutInfo(
-      vk::DescriptorSetLayoutCreateFlags(),
-      static_cast<uint32_t>(descriptorSetBindings.size()),
-      descriptorSetBindings.data());
-
-    KP_LOG_DEBUG("Kompute Algorithm creating descriptor set layout");
-    this->mDescriptorSetLayout = std::make_shared<vk::DescriptorSetLayout>();
-    vk::Result result = this->mDevice->createDescriptorSetLayout(
-      &descriptorSetLayoutInfo, nullptr, this->mDescriptorSetLayout.get());
-
-    if (result != vk::Result::eSuccess) {
-        KP_LOG_ERROR("Failed to create descriptor set layout. Error code: {}", vk::to_string(result));
-    } else {
-        this->mFreeDescriptorSetLayout = true;
-        KP_LOG_DEBUG("Successfully allocated descriptor set layout.");
-    }
-
-    vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
-      *this->mDescriptorPool,
-      1, // Descriptor set layout count
-      this->mDescriptorSetLayout.get());
-
-    KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
-    this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
-    result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
-                                                   this->mDescriptorSet.get());
-
-    if (result != vk::Result::eSuccess) {
-        KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
-    } else {
-        this->mFreeDescriptorSet = true;
-        KP_LOG_DEBUG("Successfully allocated descriptor sets.");
-    }
-
-    this->mFreeDescriptorSet = true;
-
-    KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
-
-        vk::DescriptorBufferInfo descriptorBufferInfo =
-          this->mTensors[i]->constructDescriptorBufferInfo();
-
-        computeWriteDescriptorSets.push_back(
-          vk::WriteDescriptorSet(*this->mDescriptorSet,
-                                 i, // Destination binding
-                                 0, // Destination array element
-                                 1, // Descriptor count
-                                 vk::DescriptorType::eStorageBuffer,
-                                 nullptr, // Descriptor image info
-                                 &descriptorBufferInfo));
-
-        this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
-                                            nullptr);
-    }
-
-    KP_LOG_DEBUG("Kompute Algorithm successfully run init");
-}
-
-void
-Algorithm::updateParameters()
-{
-    KP_LOG_DEBUG("Kompute Algorithm updateParameters started");
-    if (!*this->mDescriptorPool) {
-        KP_LOG_ERROR("Kompute Algorithm can not create descriptor pool");
-        return;
-    }
-
-    vk::DescriptorSetAllocateInfo descriptorSetAllocateInfo(
-      *this->mDescriptorPool,
-      1, // Descriptor set layout count
-      this->mDescriptorSetLayout.get());
-
-    KP_LOG_DEBUG("Kompute Algorithm allocating descriptor sets");
-    this->mDescriptorSet = std::make_shared<vk::DescriptorSet>();
-    vk::Result result = this->mDevice->allocateDescriptorSets(&descriptorSetAllocateInfo,
-                                                              this->mDescriptorSet.get());
-
-    if (result != vk::Result::eSuccess) {
-        KP_LOG_ERROR("Failed to allocate descriptor sets. Error code: {}", vk::to_string(result));
-    } else {
-        this->mFreeDescriptorSet = true;
-        KP_LOG_DEBUG("Successfully allocated descriptor sets.");
-    }
-
-    this->mFreeDescriptorSet = true;
-
-    KP_LOG_DEBUG("Kompute Algorithm updating descriptor sets");
-    for (size_t i = 0; i < this->mTensors.size(); i++) {
-        std::vector<vk::WriteDescriptorSet> computeWriteDescriptorSets;
-
-        vk::DescriptorBufferInfo descriptorBufferInfo =
-          this->mTensors[i]->constructDescriptorBufferInfo();
-
-        computeWriteDescriptorSets.push_back(
-          vk::WriteDescriptorSet(*this->mDescriptorSet,
-                                 i, // Destination binding
-                                 0, // Destination array element
-                                 1, // Descriptor count
-                                 vk::DescriptorType::eStorageBuffer,
-                                 nullptr, // Descriptor image info
-                                 &descriptorBufferInfo));
-
-        this->mDevice->updateDescriptorSets(computeWriteDescriptorSets,
-                                            nullptr);
-    }
-
-    KP_LOG_DEBUG("Kompute Algorithm successfully run init");
-}
-
-void
-Algorithm::createShaderModule()
-{
-    KP_LOG_DEBUG("Kompute Algorithm createShaderModule started");
-
-    vk::ShaderModuleCreateInfo shaderModuleInfo(vk::ShaderModuleCreateFlags(),
-                                                sizeof(uint32_t) *
-                                                  this->mSpirv.size(),
-                                                this->mSpirv.data());
-
-    KP_LOG_DEBUG("Kompute Algorithm Creating shader module. ShaderFileSize: {}",
-                 this->mSpirv.size());
-    this->mFreeShaderModule = true;
-    this->mShaderModule = std::make_shared<vk::ShaderModule>();
-    this->mDevice->createShaderModule(
-      &shaderModuleInfo, nullptr, this->mShaderModule.get());
-    this->mFreeShaderModule = true;
-
-    KP_LOG_DEBUG("Kompute Algorithm create shader module success");
-}
-
-void
-Algorithm::createPipeline()
-{
-    KP_LOG_DEBUG("Kompute Algorithm calling create Pipeline");
-
-    vk::PipelineLayoutCreateInfo pipelineLayoutInfo(
-      vk::PipelineLayoutCreateFlags(),
-      1, // Set layout count
-      this->mDescriptorSetLayout.get());
-
-    vk::PushConstantRange pushConstantRange;
-    if (this->mPushConstantsSize) {
-        pushConstantRange.setStageFlags(vk::ShaderStageFlagBits::eCompute);
-        pushConstantRange.setOffset(0);
-        pushConstantRange.setSize(this->mPushConstantsDataTypeMemorySize *
-                                  this->mPushConstantsSize);
-
-        pipelineLayoutInfo.setPushConstantRangeCount(1);
-        pipelineLayoutInfo.setPPushConstantRanges(&pushConstantRange);
-    }
-
-    this->mPipelineLayout = std::make_shared<vk::PipelineLayout>();
-    this->mDevice->createPipelineLayout(
-      &pipelineLayoutInfo, nullptr, this->mPipelineLayout.get());
-    this->mFreePipelineLayout = true;
-
-    std::vector<vk::SpecializationMapEntry> specializationEntries;
-
-    for (uint32_t i = 0; i < this->mSpecializationConstantsSize; i++) {
-        vk::SpecializationMapEntry specializationEntry(
-          static_cast<uint32_t>(i),
-          static_cast<uint32_t>(
-            this->mSpecializationConstantsDataTypeMemorySize * i),
-          this->mSpecializationConstantsDataTypeMemorySize);
-
-        specializationEntries.push_back(specializationEntry);
-    }
-
-    // This passes ownership of the memory so we remove ownership from
-    // specialization container by using "transferDataOwnership"
-    vk::SpecializationInfo specializationInfo(
-      static_cast<uint32_t>(specializationEntries.size()),
-      specializationEntries.data(),
-      this->mSpecializationConstantsDataTypeMemorySize *
-        this->mSpecializationConstantsSize,
-      this->mSpecializationConstantsData);
-
-    vk::PipelineShaderStageCreateInfo shaderStage(
-      vk::PipelineShaderStageCreateFlags(),
-      vk::ShaderStageFlagBits::eCompute,
-      *this->mShaderModule,
-      "main",
-      &specializationInfo);
-
-    vk::ComputePipelineCreateInfo pipelineInfo(vk::PipelineCreateFlags(),
-                                               shaderStage,
-                                               *this->mPipelineLayout,
-                                               vk::Pipeline(),
-                                               0);
-
-#ifdef KOMPUTE_CREATE_PIPELINE_RESULT_VALUE
-    vk::ResultValue<vk::Pipeline> pipelineResult =
-      this->mDevice->createComputePipeline(*mPipelineCache, pipelineInfo);
-
-    if (pipelineResult.result != vk::Result::eSuccess) {
-        throw std::runtime_error("Failed to create pipeline result: " +
-                                 vk::to_string(pipelineResult.result));
-    }
-
-    vk::Pipeline& pipeline = pipelineResult.value;
-    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
-    this->mFreePipeline = true;
-#else
-    vk::Pipeline pipeline =
-      this->mDevice->createComputePipeline(*mPipelineCache, pipelineInfo)
-        .value;
-    this->mPipeline = std::make_shared<vk::Pipeline>(pipeline);
-    this->mFreePipeline = true;
-#endif
-
-    // TODO: Update to consistent
-    // this->mPipeline = std::make_shared<vk::Pipeline>();
-    // this->mDevice->createComputePipelines(
-    //   *this->mPipelineCache, 1, &pipelineInfo, nullptr,
-    //   this->mPipeline.get());
-
-    KP_LOG_DEBUG("Kompute Algorithm Create Pipeline Success");
-}
-
-void
-Algorithm::recordBindCore(const vk::CommandBuffer& commandBuffer)
-{
-    KP_LOG_DEBUG("Kompute Algorithm binding pipeline");
-
-    commandBuffer.bindPipeline(vk::PipelineBindPoint::eCompute,
-                               *this->mPipeline);
-
-    KP_LOG_DEBUG("Kompute Algorithm binding descriptor sets");
-
-    commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                                     *this->mPipelineLayout,
-                                     0, // First set
-
*this->mDescriptorSet, - nullptr // Dispatcher - ); -} - -void -Algorithm::recordBindPush(const vk::CommandBuffer& commandBuffer) -{ - if (this->mPushConstantsSize) { - KP_LOG_DEBUG("Kompute Algorithm binding push constants memory size: {}", - this->mPushConstantsSize * - this->mPushConstantsDataTypeMemorySize); - - commandBuffer.pushConstants(*this->mPipelineLayout, - vk::ShaderStageFlagBits::eCompute, - 0, - this->mPushConstantsSize * - this->mPushConstantsDataTypeMemorySize, - this->mPushConstantsData); - } -} - -void -Algorithm::recordDispatch(const vk::CommandBuffer& commandBuffer) -{ - KP_LOG_DEBUG("Kompute Algorithm recording dispatch"); - - commandBuffer.dispatch( - this->mWorkgroup[0], this->mWorkgroup[1], this->mWorkgroup[2]); -} - -void -Algorithm::setWorkgroup(const Workgroup& workgroup, uint32_t minSize) -{ - KP_LOG_INFO("Kompute OpAlgoCreate setting dispatch size"); - - // The dispatch size is set up based on either explicitly provided template - // parameters or by default it would take the shape and size of the tensors - if (workgroup[0] > 0) { - // If at least the x value is provided we use mainly the parameters - // provided - this->mWorkgroup = { workgroup[0], - workgroup[1] > 0 ? workgroup[1] : 1, - workgroup[2] > 0 ? workgroup[2] : 1 }; - } else { - this->mWorkgroup = { minSize, 1, 1 }; - } - - KP_LOG_INFO("Kompute OpAlgoCreate set dispatch size X: {}, Y: {}, Z: {}", - this->mWorkgroup[0], - this->mWorkgroup[1], - this->mWorkgroup[2]); -} - -const Workgroup& -Algorithm::getWorkgroup() -{ - return this->mWorkgroup; -} - -const std::vector>& -Algorithm::getTensors() -{ - return this->mTensors; -} - -void Algorithm::setTensors(const std::vector>& tensors) -{ - this->mTensors = tensors; -} - -} diff --git a/kompute/src/CMakeLists.txt b/kompute/src/CMakeLists.txt deleted file mode 100644 index 42b7d07f5..000000000 --- a/kompute/src/CMakeLists.txt +++ /dev/null @@ -1,86 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -cmake_minimum_required(VERSION 3.20) - -if(KOMPUTE_OPT_ANDROID_BUILD) - find_library(android android) -endif() - -cmake_minimum_required(VERSION 3.20) - -add_library(kompute STATIC Algorithm.cpp - Manager.cpp - OpAlgoDispatch.cpp - OpMemoryBarrier.cpp - OpTensorCopy.cpp - OpTensorFill.cpp - OpTensorSyncDevice.cpp - OpTensorSyncLocal.cpp - OpBufferSyncDevice.cpp - OpBufferSyncLocal.cpp - Sequence.cpp - Tensor.cpp - Core.cpp) - -add_library(kompute::kompute ALIAS kompute) - -# Set version for shared libraries. 
-set_target_properties(kompute - PROPERTIES - VERSION ${${PROJECT_NAME}_VERSION} - SOVERSION ${${PROJECT_NAME}_VERSION_MAJOR} - POSITION_INDEPENDENT_CODE TRUE) - -# Import GNU common install directory variables -include(GNUInstallDirs) - -install(TARGETS kompute - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}) - -# Include CMake helpers for package config files -# Follow this installation guideline: https://cmake.org/cmake/help/latest/manual/cmake-packages.7.html -include(CMakePackageConfigHelpers) - -configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/komputeConfig.cmake.in - "${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute) - -#install(FILES ${PROJECT_BINARY_DIR}/kompute/komputeConfig.cmake -# ${PROJECT_BINARY_DIR}/kompute/komputeConfigVersion.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/kompute) - -# #################################################### -# Linking -# #################################################### -if(KOMPUTE_OPT_ANDROID_BUILD) - target_link_libraries(kompute PUBLIC vulkanAndroid - android - kp_logger - kp_shader - fmt::fmt-header-only) -else() - target_link_libraries(kompute PUBLIC - kp_logger - kp_shader - fmt::fmt-header-only) -endif() - -if(KOMPUTE_OPT_BUILD_PYTHON) - include_directories(${PYTHON_INCLUDE_DIRS}) - - target_link_libraries(kompute PRIVATE pybind11::headers ${PYTHON_LIBRARIES}) -endif() - -if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER) - target_link_libraries(kompute PUBLIC Vulkan-Headers) -else() - target_link_libraries(kompute PUBLIC Vulkan::Headers) -endif() - -# #################################################### -# Misc -# #################################################### -add_subdirectory(logger) -add_subdirectory(shaders) -add_subdirectory(include) diff --git a/kompute/src/Core.cpp b/kompute/src/Core.cpp deleted file mode 100644 index 020f44160..000000000 --- a/kompute/src/Core.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/Core.hpp" - -#ifndef KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE -#define KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE -/** - * Ensures support for dynamic loading of Vulkan functions on Android. - * Acts as a default store for loaded functions. 
- * More information: - * https://github.com/KhronosGroup/Vulkan-Hpp#vulkan_hpp_default_dispatcher - **/ -VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE -#endif // !KOMPUTE_VK_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE - -namespace kp { -} // namespace kp diff --git a/kompute/src/Manager.cpp b/kompute/src/Manager.cpp deleted file mode 100644 index 0c588e19b..000000000 --- a/kompute/src/Manager.cpp +++ /dev/null @@ -1,512 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/Manager.hpp" -#include "fmt/format.h" -#include "kompute/logger/Logger.hpp" -#include -#include -#include -#include -#include - -namespace kp { - -#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS -static VKAPI_ATTR VkBool32 VKAPI_CALL -debugMessageCallback(VkDebugReportFlagsEXT /*flags*/, - VkDebugReportObjectTypeEXT /*objectType*/, - uint64_t /*object*/, - size_t /*location*/, - int32_t /*messageCode*/, -#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG - const char* pLayerPrefix, - const char* pMessage, -#else - const char* /*pLayerPrefix*/, - const char* /*pMessage*/, -#endif - void* /*pUserData*/) -{ - KP_LOG_DEBUG("[VALIDATION]: {} - {}", pLayerPrefix, pMessage); - return VK_FALSE; -} -#endif - -Manager::Manager() -{ - this->mManageResources = true; - -// Make sure the logger is setup -#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED - logger::setupLogger(); -#endif - this->createInstance(); -} - -void Manager::initializeDevice(uint32_t physicalDeviceIndex, - const std::vector& familyQueueIndices, - const std::vector& desiredExtensions) -{ - this->createDevice( - familyQueueIndices, physicalDeviceIndex, desiredExtensions); -} - -Manager::~Manager() -{ - KP_LOG_DEBUG("Kompute Manager Destructor started"); - this->destroy(); -} - -void -Manager::destroy() -{ - - KP_LOG_DEBUG("Kompute Manager destroy() started"); - - if (this->mDevice == nullptr) { - KP_LOG_ERROR( - "Kompute Manager destructor reached with null Device pointer"); - return; - } - - if (this->mManageResources && this->mManagedSequences.size()) { - KP_LOG_DEBUG("Kompute Manager explicitly running destructor for " - "managed sequences"); - for (const std::weak_ptr& weakSq : this->mManagedSequences) { - if (std::shared_ptr sq = weakSq.lock()) { - sq->destroy(); - } - } - this->mManagedSequences.clear(); - } - - if (this->mManageResources && !this->mManagedAlgorithmsMap.empty()) { - KP_LOG_DEBUG("Kompute Manager explicitly freeing algorithms"); - for (const auto& kv : this->mManagedAlgorithmsMap) { - if (std::shared_ptr algorithm = kv.second) { - algorithm->destroy(); - } - } - this->mManagedAlgorithmsMap.clear(); - } - - if (this->mManageResources && this->mManagedTensors.size()) { - KP_LOG_DEBUG("Kompute Manager explicitly freeing tensors"); - for (const std::weak_ptr& weakTensor : this->mManagedTensors) { - if (std::shared_ptr tensor = weakTensor.lock()) { - tensor->destroy(); - } - } - this->mManagedTensors.clear(); - } - - if (this->mPipelineCache) { - KP_LOG_DEBUG("Kompute Manager Destroying pipeline cache"); - if (!this->mPipelineCache) { - KP_LOG_WARN("Kompute Manager Error requested to destroy " - "pipeline cache but it is null"); - } - this->mDevice->destroy( - *this->mPipelineCache, - (vk::Optional)nullptr); - this->mPipelineCache = nullptr; - } - - if (this->mFreeDevice) { - KP_LOG_INFO("Destroying device"); - this->mDevice->destroy( - (vk::Optional)nullptr); - this->mDevice = nullptr; - KP_LOG_DEBUG("Kompute Manager Destroyed Device"); - } - - if (this->mInstance == nullptr) { - KP_LOG_ERROR( - "Kompute Manager destructor reached with null 
Instance pointer"); - return; - } - -#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS - if (this->mDebugReportCallback) { - this->mInstance->destroyDebugReportCallbackEXT( - this->mDebugReportCallback, nullptr, this->mDebugDispatcher); - KP_LOG_DEBUG("Kompute Manager Destroyed Debug Report Callback"); - } -#endif - - if (this->mFreeInstance) { - this->mInstance->destroy( - (vk::Optional)nullptr); - this->mInstance = nullptr; - KP_LOG_DEBUG("Kompute Manager Destroyed Instance"); - } -} - -void -Manager::createInstance() -{ - - KP_LOG_DEBUG("Kompute Manager creating instance"); - - this->mFreeInstance = true; - - vk::ApplicationInfo applicationInfo; - applicationInfo.pApplicationName = "Kompute"; - applicationInfo.pEngineName = "Kompute"; - applicationInfo.apiVersion = KOMPUTE_VK_API_VERSION; - applicationInfo.engineVersion = KOMPUTE_VK_API_VERSION; - applicationInfo.applicationVersion = KOMPUTE_VK_API_VERSION; - - std::vector applicationExtensions; - -#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS - applicationExtensions.push_back(VK_EXT_DEBUG_REPORT_EXTENSION_NAME); -#endif - - vk::InstanceCreateInfo computeInstanceCreateInfo; - computeInstanceCreateInfo.pApplicationInfo = &applicationInfo; - if (!applicationExtensions.empty()) { - computeInstanceCreateInfo.enabledExtensionCount = - (uint32_t)applicationExtensions.size(); - computeInstanceCreateInfo.ppEnabledExtensionNames = - applicationExtensions.data(); - } - - try { - mDynamicLoader = std::make_shared(); - } catch (const std::exception & err) { - return; - } - - PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr = - mDynamicLoader->getProcAddress("vkGetInstanceProcAddr"); - VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); - -#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS - KP_LOG_DEBUG("Kompute Manager adding debug validation layers"); - // We'll identify the layers that are supported - std::vector validLayerNames; - std::vector desiredLayerNames = { - "VK_LAYER_LUNARG_assistant_layer", - "VK_LAYER_LUNARG_standard_validation", - "VK_LAYER_KHRONOS_validation", - }; - std::vector envLayerNames; - const char* envLayerNamesVal = std::getenv("KOMPUTE_ENV_DEBUG_LAYERS"); - if (envLayerNamesVal != nullptr && *envLayerNamesVal != '\0') { - KP_LOG_DEBUG("Kompute Manager adding environment layers: {}", - envLayerNamesVal); - std::istringstream iss(envLayerNamesVal); - std::istream_iterator beg(iss); - std::istream_iterator end; - envLayerNames = std::vector(beg, end); - for (const std::string& layerName : envLayerNames) { - desiredLayerNames.push_back(layerName.c_str()); - } - KP_LOG_DEBUG("Desired layers: {}", fmt::join(desiredLayerNames, ", ")); - } - - // Identify the valid layer names based on the desiredLayerNames - { - std::set uniqueLayerNames; - std::vector availableLayerProperties = - vk::enumerateInstanceLayerProperties(); - for (vk::LayerProperties layerProperties : availableLayerProperties) { - std::string layerName(layerProperties.layerName.data()); - uniqueLayerNames.insert(layerName); - } - KP_LOG_DEBUG("Available layers: {}", fmt::join(uniqueLayerNames, ", ")); - for (const char* desiredLayerName : desiredLayerNames) { - if (uniqueLayerNames.count(desiredLayerName) != 0) { - validLayerNames.push_back(desiredLayerName); - } - } - } - - if (!validLayerNames.empty()) { - KP_LOG_DEBUG( - "Kompute Manager Initializing instance with valid layers: {}", - fmt::join(validLayerNames, ", ")); - computeInstanceCreateInfo.enabledLayerCount = - static_cast(validLayerNames.size()); - computeInstanceCreateInfo.ppEnabledLayerNames = validLayerNames.data(); - } 
else { - KP_LOG_WARN("Kompute Manager no valid layer names found from desired " - "layer names"); - } -#endif - - this->mInstance = std::make_shared(); - vk::Result r = vk::createInstance( - &computeInstanceCreateInfo, nullptr, this->mInstance.get()); - if (r != vk::Result::eSuccess) { - KP_LOG_ERROR( - "Kompute Manager Error allocating vulkan instance", vk::to_string(r)); - this->mInstance = nullptr; - this->mFreeInstance = false; - return; - } - - VULKAN_HPP_DEFAULT_DISPATCHER.init(*this->mInstance); - - KP_LOG_DEBUG("Kompute Manager Instance Created"); - -#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS - KP_LOG_DEBUG("Kompute Manager adding debug callbacks"); - if (validLayerNames.size() > 0) { - vk::DebugReportFlagsEXT debugFlags = - vk::DebugReportFlagBitsEXT::eError | - vk::DebugReportFlagBitsEXT::eWarning; - vk::DebugReportCallbackCreateInfoEXT debugCreateInfo = {}; - debugCreateInfo.pfnCallback = - (PFN_vkDebugReportCallbackEXT)debugMessageCallback; - debugCreateInfo.flags = debugFlags; - - this->mDebugDispatcher.init(*this->mInstance, vkGetInstanceProcAddr); - this->mDebugReportCallback = - this->mInstance->createDebugReportCallbackEXT( - debugCreateInfo, nullptr, this->mDebugDispatcher); - } -#endif -} - -void -Manager::clear() -{ - if (this->mManageResources) { - this->mManagedTensors.erase( - std::remove_if(begin(this->mManagedTensors), - end(this->mManagedTensors), - [](std::weak_ptr t) { return t.expired(); }), - end(this->mManagedTensors)); - for (auto it = this->mManagedAlgorithmsMap.begin(); - it != this->mManagedAlgorithmsMap.end();) { - if (it->second) { - it = this->mManagedAlgorithmsMap.erase(it); - } else { - ++it; - } - } - this->mManagedSequences.erase( - std::remove_if(begin(this->mManagedSequences), - end(this->mManagedSequences), - [](std::weak_ptr t) { return t.expired(); }), - end(this->mManagedSequences)); - } -} - -void -Manager::createDevice(const std::vector& familyQueueIndices, - uint32_t physicalDeviceIndex, - const std::vector& desiredExtensions) -{ - - KP_LOG_DEBUG("Kompute Manager creating Device"); - - if (this->mInstance == nullptr) { - throw std::runtime_error("Kompute Manager instance is null"); - } - - this->mFreeDevice = true; - - // Getting an integer that says how many vuklan devices we have - std::vector physicalDevices = - this->mInstance->enumeratePhysicalDevices(); - uint32_t deviceCount = physicalDevices.size(); - - // This means there are no devices at all - if (deviceCount == 0) { - throw std::runtime_error("Failed to find GPUs with Vulkan support! " - "Maybe you haven't installed vulkan drivers?"); - } - - // This means that we're exceeding our device limit, for - // example if we have 2 devices, just physicalDeviceIndex - // 0 and 1 are acceptable. 
Hence, physicalDeviceIndex should - // always be less than deviceCount, else we raise an error - if (!(deviceCount > physicalDeviceIndex)) { - throw std::runtime_error("There is no such physical index or device, " - "please use your existing device"); - } - - vk::PhysicalDevice physicalDevice = physicalDevices[physicalDeviceIndex]; - - this->mPhysicalDevice = - std::make_shared(physicalDevice); - -#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO - vk::PhysicalDeviceProperties physicalDeviceProperties = - physicalDevice.getProperties(); -#endif - - KP_LOG_INFO("Using physical device index {} found {}", - physicalDeviceIndex, - physicalDeviceProperties.deviceName.data()); - - if (familyQueueIndices.empty()) { - // Find compute queue - std::vector allQueueFamilyProperties = - physicalDevice.getQueueFamilyProperties(); - - uint32_t computeQueueFamilyIndex = 0; - bool computeQueueSupported = false; - for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) { - vk::QueueFamilyProperties queueFamilyProperties = - allQueueFamilyProperties[i]; - - if (queueFamilyProperties.queueFlags & - vk::QueueFlagBits::eCompute) { - computeQueueFamilyIndex = i; - computeQueueSupported = true; - break; - } - } - - if (!computeQueueSupported) { - throw std::runtime_error("Compute queue is not supported"); - } - - this->mComputeQueueFamilyIndices.push_back(computeQueueFamilyIndex); - } else { - this->mComputeQueueFamilyIndices = familyQueueIndices; - } - - std::unordered_map familyQueueCounts; - std::unordered_map> familyQueuePriorities; - for (const auto& value : this->mComputeQueueFamilyIndices) { - familyQueueCounts[value]++; - familyQueuePriorities[value].push_back(1.0f); - } - - std::unordered_map familyQueueIndexCount; - std::vector deviceQueueCreateInfos; - for (const auto& familyQueueInfo : familyQueueCounts) { - // Setting the device count to 0 - familyQueueIndexCount[familyQueueInfo.first] = 0; - - // Creating the respective device queue - vk::DeviceQueueCreateInfo deviceQueueCreateInfo( - vk::DeviceQueueCreateFlags(), - familyQueueInfo.first, - familyQueueInfo.second, - familyQueuePriorities[familyQueueInfo.first].data()); - deviceQueueCreateInfos.push_back(deviceQueueCreateInfo); - } - - KP_LOG_DEBUG("Kompute Manager desired extension layers {}", - fmt::join(desiredExtensions, ", ")); - - std::vector deviceExtensions = - this->mPhysicalDevice->enumerateDeviceExtensionProperties(); - - std::set uniqueExtensionNames; - for (const vk::ExtensionProperties& ext : deviceExtensions) { - uniqueExtensionNames.insert(ext.extensionName); - } - KP_LOG_DEBUG("Kompute Manager available extensions {}", - fmt::join(uniqueExtensionNames, ", ")); - std::vector validExtensions; - for (const std::string& ext : desiredExtensions) { - if (uniqueExtensionNames.count(ext) != 0) { - validExtensions.push_back(ext.c_str()); - } - } - if (desiredExtensions.size() != validExtensions.size()) { - KP_LOG_ERROR("Kompute Manager not all extensions were added: {}", - fmt::join(validExtensions, ", ")); - } - - vk::PhysicalDeviceFeatures features; - features.shaderInt16 = true; - - vk::PhysicalDeviceVulkan11Features features11; - features11.uniformAndStorageBuffer16BitAccess = true; - features11.storageBuffer16BitAccess = true; - features11.pNext = nullptr; - - vk::PhysicalDeviceVulkan12Features features12; - features12.storageBuffer8BitAccess = true; - features12.uniformAndStorageBuffer8BitAccess = true; - features12.shaderFloat16 = true; - features12.shaderInt8 = true; - features12.pNext = &features11; - - 
vk::DeviceCreateInfo deviceCreateInfo(vk::DeviceCreateFlags(), - deviceQueueCreateInfos.size(), - deviceQueueCreateInfos.data(), - {}, - {}, - validExtensions.size(), - validExtensions.data(), - &features); - - deviceCreateInfo.pNext = &features12; - - this->mDevice = std::make_shared(); - vk::Result r = physicalDevice.createDevice( - &deviceCreateInfo, nullptr, this->mDevice.get()); - if (r != vk::Result::eSuccess) { - KP_LOG_ERROR("Kompute Manager could not create device"); - } - - KP_LOG_DEBUG("Kompute Manager device created"); - - for (const uint32_t& familyQueueIndex : this->mComputeQueueFamilyIndices) { - std::shared_ptr currQueue = std::make_shared(); - - this->mDevice->getQueue(familyQueueIndex, - familyQueueIndexCount[familyQueueIndex], - currQueue.get()); - - familyQueueIndexCount[familyQueueIndex]++; - - this->mComputeQueues.push_back(currQueue); - } - - KP_LOG_DEBUG("Kompute Manager compute queue obtained"); - - mPipelineCache = std::make_shared(); - vk::PipelineCacheCreateInfo pipelineCacheInfo = - vk::PipelineCacheCreateInfo(); - this->mDevice->createPipelineCache( - &pipelineCacheInfo, nullptr, mPipelineCache.get()); -} - -std::shared_ptr -Manager::sequence(uint32_t queueIndex, uint32_t totalTimestamps) -{ - KP_LOG_DEBUG("Kompute Manager sequence() with queueIndex: {}", queueIndex); - - std::shared_ptr sq{ new kp::Sequence( - this->mPhysicalDevice, - this->mDevice, - this->mComputeQueues[queueIndex], - this->mComputeQueueFamilyIndices[queueIndex], - totalTimestamps) }; - - if (this->mManageResources) { - this->mManagedSequences.push_back(sq); - } - - return sq; -} - -vk::PhysicalDeviceProperties -Manager::getDeviceProperties() const -{ - return this->mPhysicalDevice->getProperties(); -} - -std::vector -Manager::listDevices() const -{ - return this->mInstance->enumeratePhysicalDevices(); -} - -std::shared_ptr -Manager::getVkInstance() const -{ - return this->mInstance; -} - -} diff --git a/kompute/src/OpAlgoDispatch.cpp b/kompute/src/OpAlgoDispatch.cpp deleted file mode 100644 index edc0f6eb6..000000000 --- a/kompute/src/OpAlgoDispatch.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/operations/OpAlgoDispatch.hpp" - -namespace kp { - -OpAlgoDispatch::~OpAlgoDispatch() -{ - KP_LOG_DEBUG("Kompute OpAlgoDispatch destructor started"); - - if (this->mPushConstantsData) { - KP_LOG_DEBUG("Kompute freeing push constants data"); - free(this->mPushConstantsData); - } -} - -void -OpAlgoDispatch::record(const vk::CommandBuffer& commandBuffer) -{ - KP_LOG_DEBUG("Kompute OpAlgoDispatch record called"); - - // Barrier to ensure the data is finished writing to buffer memory - for (const std::shared_ptr& tensor : - this->mAlgorithm->getTensors()) { - tensor->recordPrimaryBufferMemoryBarrier( - commandBuffer, - vk::AccessFlagBits::eShaderWrite, - vk::AccessFlagBits::eShaderRead, - vk::PipelineStageFlagBits::eComputeShader, - vk::PipelineStageFlagBits::eComputeShader); - } - - if (this->mPushConstantsSize) { - this->mAlgorithm->setPushConstants( - this->mPushConstantsData, - this->mPushConstantsSize, - this->mPushConstantsDataTypeMemorySize); - } - - this->mAlgorithm->recordBindCore(commandBuffer); - this->mAlgorithm->recordBindPush(commandBuffer); - this->mAlgorithm->recordDispatch(commandBuffer); -} - -void -OpAlgoDispatch::preEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpAlgoDispatch preEval called"); -} - -void -OpAlgoDispatch::postEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute 
OpAlgoDispatch postSubmit called"); -} - -} diff --git a/kompute/src/OpBufferSyncDevice.cpp b/kompute/src/OpBufferSyncDevice.cpp deleted file mode 100644 index 1812d04b2..000000000 --- a/kompute/src/OpBufferSyncDevice.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/operations/OpBufferSyncDevice.hpp" - -namespace kp { - -OpBufferSyncDevice::OpBufferSyncDevice( - vk::Buffer *primaryBuffer, - vk::Buffer *stagingBuffer, - vk::DeviceSize size) - : mPrimaryBuffer(primaryBuffer) - , mStagingBuffer(stagingBuffer) - , mSize(size) -{ - KP_LOG_DEBUG("Kompute OpBufferSyncDevice constructor with params"); -} - -OpBufferSyncDevice::~OpBufferSyncDevice() -{ - KP_LOG_DEBUG("Kompute OpBufferSyncDevice destructor started"); -} - -void -OpBufferSyncDevice::record(const vk::CommandBuffer& commandBuffer) -{ - KP_LOG_DEBUG("Kompute OpBufferSyncDevice record called"); - vk::BufferCopy copyRegion(0, 0, mSize); - commandBuffer.copyBuffer(*mStagingBuffer, *mPrimaryBuffer, copyRegion); -} - -void -OpBufferSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpBufferSyncDevice preEval called"); -} - -void -OpBufferSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpBufferSyncDevice postEval called"); -} - -} diff --git a/kompute/src/OpBufferSyncLocal.cpp b/kompute/src/OpBufferSyncLocal.cpp deleted file mode 100644 index a829819fa..000000000 --- a/kompute/src/OpBufferSyncLocal.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/operations/OpBufferSyncLocal.hpp" - -namespace kp { - -OpBufferSyncLocal::OpBufferSyncLocal( - vk::Buffer *primaryBuffer, - vk::Buffer *stagingBuffer, - vk::DeviceSize size) - : mPrimaryBuffer(primaryBuffer) - , mStagingBuffer(stagingBuffer) - , mSize(size) -{ - KP_LOG_DEBUG("Kompute OpBufferSyncLocal constructor with params"); -} - -OpBufferSyncLocal::~OpBufferSyncLocal() -{ - KP_LOG_DEBUG("Kompute OpBufferSyncLocal destructor started"); -} - -void -OpBufferSyncLocal::record(const vk::CommandBuffer& commandBuffer) -{ - KP_LOG_DEBUG("Kompute OpBufferSyncLocal record called"); - vk::BufferCopy copyRegion(0, 0, mSize); - commandBuffer.copyBuffer(*mPrimaryBuffer, *mStagingBuffer, copyRegion); -} - -void -OpBufferSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpBufferSyncLocal preEval called"); -} - -void -OpBufferSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpBufferSyncLocal postEval called"); -} - -} diff --git a/kompute/src/OpMemoryBarrier.cpp b/kompute/src/OpMemoryBarrier.cpp deleted file mode 100644 index 1f075a3c4..000000000 --- a/kompute/src/OpMemoryBarrier.cpp +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/operations/OpMemoryBarrier.hpp" - -namespace kp { - -OpMemoryBarrier::OpMemoryBarrier( - const std::vector>& tensors, - const vk::AccessFlagBits& srcAccessMask, - const vk::AccessFlagBits& dstAccessMask, - const vk::PipelineStageFlagBits& srcStageMask, - const vk::PipelineStageFlagBits& dstStageMask, - bool barrierOnPrimary) - : mSrcAccessMask(srcAccessMask) - , mDstAccessMask(dstAccessMask) - , mSrcStageMask(srcStageMask) - , mDstStageMask(dstStageMask) - , mBarrierOnPrimary(barrierOnPrimary) - , mTensors(tensors) -{ - KP_LOG_DEBUG("Kompute OpMemoryBarrier constructor"); -} - -OpMemoryBarrier::~OpMemoryBarrier() -{ - KP_LOG_DEBUG("Kompute OpMemoryBarrier destructor started"); -} - -void 
-OpMemoryBarrier::record(const vk::CommandBuffer& commandBuffer) -{ - KP_LOG_DEBUG("Kompute OpMemoryBarrier record called"); - - // Barrier to ensure the data is finished writing to buffer memory - if (this->mBarrierOnPrimary) { - for (const std::shared_ptr& tensor : this->mTensors) { - tensor->recordPrimaryBufferMemoryBarrier(commandBuffer, - this->mSrcAccessMask, - this->mDstAccessMask, - this->mSrcStageMask, - this->mDstStageMask); - } - } else { - for (const std::shared_ptr& tensor : this->mTensors) { - tensor->recordStagingBufferMemoryBarrier(commandBuffer, - this->mSrcAccessMask, - this->mDstAccessMask, - this->mSrcStageMask, - this->mDstStageMask); - } - } -} - -void -OpMemoryBarrier::preEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpMemoryBarrier preEval called"); -} - -void -OpMemoryBarrier::postEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpMemoryBarrier postSubmit called"); -} - -} diff --git a/kompute/src/OpTensorCopy.cpp b/kompute/src/OpTensorCopy.cpp deleted file mode 100644 index 1eaf428b8..000000000 --- a/kompute/src/OpTensorCopy.cpp +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/operations/OpTensorCopy.hpp" -#include "kompute/Tensor.hpp" - -namespace kp { - -OpTensorCopy::OpTensorCopy(const std::vector>& tensors) -{ - KP_LOG_DEBUG("Kompute OpTensorCopy constructor with params"); - - this->mTensors = tensors; - - if (this->mTensors.size() < 2) { - throw std::runtime_error( - "Kompute OpTensorCopy called with less than 2 tensor"); - } - - kp::Tensor::TensorDataTypes dataType = this->mTensors[0]->dataType(); - uint32_t size = this->mTensors[0]->size(); - for (const std::shared_ptr& tensor : tensors) { - if (tensor->dataType() != dataType) { - throw std::runtime_error(fmt::format( - "Attempting to copy tensors of different types from {} to {}", - Tensor::toString(dataType), - Tensor::toString(tensor->dataType()))); - } - if (tensor->size() != size) { - throw std::runtime_error(fmt::format( - "Attempting to copy tensors of different sizes from {} to {}", - size, - tensor->size())); - } - } -} - -OpTensorCopy::~OpTensorCopy() -{ - KP_LOG_DEBUG("Kompute OpTensorCopy destructor started"); -} - -void -OpTensorCopy::record(const vk::CommandBuffer& commandBuffer) -{ - KP_LOG_DEBUG("Kompute OpTensorCopy record called"); - - // We iterate from the second tensor onwards and record a copy to all - for (size_t i = 1; i < this->mTensors.size(); i++) { - this->mTensors[i]->recordCopyFrom(commandBuffer, this->mTensors[0]); - } -} - -void -OpTensorCopy::preEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpTensorCopy preEval called"); -} - -void -OpTensorCopy::postEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpTensorCopy postEval called"); - - // Do not copy on CPU side if source is storage tensor - if (this->mTensors[0]->tensorType() == kp::Tensor::TensorTypes::eStorage) - { - KP_LOG_DEBUG("Kompute OpTensorCopy not copying tensor source given it's of eStorage type"); - return; - } - void* data = this->mTensors[0]->rawData(); - - // Copy the data from the first tensor into all the tensors - for (size_t i = 1; i < this->mTensors.size(); i++) { - if (this->mTensors[i]->tensorType() == kp::Tensor::TensorTypes::eStorage) { - KP_LOG_DEBUG("Kompute OpTensorCopy not copying to tensor dest given it's of eStorage type"); - continue; - } - this->mTensors[i]->setRawData(data); - } -} - -} diff --git a/kompute/src/OpTensorFill.cpp 
b/kompute/src/OpTensorFill.cpp deleted file mode 100644 index bda7d6040..000000000 --- a/kompute/src/OpTensorFill.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/operations/OpTensorFill.hpp" -#include "kompute/Tensor.hpp" - -namespace kp { - -OpTensorFill::OpTensorFill(const std::vector>& tensors) -{ - KP_LOG_DEBUG("Kompute OpTensorFill constructor with params"); - - if (tensors.size() < 1) { - throw std::runtime_error( - "Kompute OpTensorFill called with less than 1 tensor"); - } - - this->mTensors = tensors; -} - -OpTensorFill::~OpTensorFill() -{ - KP_LOG_DEBUG("Kompute OpTensorFill destructor started"); -} - -void -OpTensorFill::record(const vk::CommandBuffer& commandBuffer) -{ - KP_LOG_DEBUG("Kompute OpTensorFill record called"); - - for (size_t i = 0; i < this->mTensors.size(); i++) { - this->mTensors[i]->recordFill(commandBuffer, 0); - } -} - -void -OpTensorFill::preEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpTensorFill preEval called"); -} - -void -OpTensorFill::postEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpTensorFill postEval called"); -} - -} diff --git a/kompute/src/OpTensorSyncDevice.cpp b/kompute/src/OpTensorSyncDevice.cpp deleted file mode 100644 index b563529ea..000000000 --- a/kompute/src/OpTensorSyncDevice.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/operations/OpTensorSyncDevice.hpp" - -namespace kp { - -OpTensorSyncDevice::OpTensorSyncDevice( - const std::vector>& tensors) - : mPrimaryBuffer(nullptr) - , mStagingBuffer(nullptr) -{ - KP_LOG_DEBUG("Kompute OpTensorSyncDevice constructor with params"); - - if (tensors.size() < 1) { - throw std::runtime_error( - "Kompute OpTensorSyncDevice called with less than 1 tensor"); - } - - this->mTensors = tensors; -} - -OpTensorSyncDevice::~OpTensorSyncDevice() -{ - KP_LOG_DEBUG("Kompute OpTensorSyncDevice destructor started"); - - this->mTensors.clear(); -} - -void -OpTensorSyncDevice::record(const vk::CommandBuffer& commandBuffer) -{ - KP_LOG_DEBUG("Kompute OpTensorSyncDevice record called"); - - for (size_t i = 0; i < this->mTensors.size(); i++) { - if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - this->mTensors[i]->recordCopyFromStagingToDevice(commandBuffer); - } - } -} - -void -OpTensorSyncDevice::preEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpTensorSyncDevice preEval called"); -} - -void -OpTensorSyncDevice::postEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpTensorSyncDevice postEval called"); -} - -} diff --git a/kompute/src/OpTensorSyncLocal.cpp b/kompute/src/OpTensorSyncLocal.cpp deleted file mode 100644 index 7818db565..000000000 --- a/kompute/src/OpTensorSyncLocal.cpp +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/Tensor.hpp" - -#include "kompute/operations/OpTensorSyncLocal.hpp" - -namespace kp { - -OpTensorSyncLocal::OpTensorSyncLocal( - const std::vector>& tensors) -{ - KP_LOG_DEBUG("Kompute OpTensorSyncLocal constructor with params"); - - if (tensors.size() < 1) { - throw std::runtime_error( - "Kompute OpTensorSyncLocal called with less than 1 tensor"); - } - - this->mTensors = tensors; -} - -OpTensorSyncLocal::~OpTensorSyncLocal() -{ - KP_LOG_DEBUG("Kompute OpTensorSyncLocal destructor started"); -} - -void -OpTensorSyncLocal::record(const vk::CommandBuffer& commandBuffer) -{ - KP_LOG_DEBUG("Kompute OpTensorSyncLocal 
record called"); - - for (size_t i = 0; i < this->mTensors.size(); i++) { - if (this->mTensors[i]->tensorType() == Tensor::TensorTypes::eDevice) { - - this->mTensors[i]->recordPrimaryBufferMemoryBarrier( - commandBuffer, - vk::AccessFlagBits::eShaderWrite, - vk::AccessFlagBits::eTransferRead, - vk::PipelineStageFlagBits::eComputeShader, - vk::PipelineStageFlagBits::eTransfer); - - this->mTensors[i]->recordCopyFromDeviceToStaging(commandBuffer); - - this->mTensors[i]->recordPrimaryBufferMemoryBarrier( - commandBuffer, - vk::AccessFlagBits::eTransferWrite, - vk::AccessFlagBits::eHostRead, - vk::PipelineStageFlagBits::eTransfer, - vk::PipelineStageFlagBits::eHost); - } - } -} - -void -OpTensorSyncLocal::preEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpTensorSyncLocal preEval called"); -} - -void -OpTensorSyncLocal::postEval(const vk::CommandBuffer& /*commandBuffer*/) -{ - KP_LOG_DEBUG("Kompute OpTensorSyncLocal postEval called"); - - KP_LOG_DEBUG("Kompute OpTensorSyncLocal mapping data into tensor local"); -} - -} diff --git a/kompute/src/Sequence.cpp b/kompute/src/Sequence.cpp deleted file mode 100644 index da3b379a3..000000000 --- a/kompute/src/Sequence.cpp +++ /dev/null @@ -1,388 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/Sequence.hpp" - -namespace kp { - -Sequence::Sequence(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr computeQueue, - uint32_t queueIndex, - uint32_t totalTimestamps) -{ - KP_LOG_DEBUG("Kompute Sequence Constructor with existing device & queue"); - - this->mPhysicalDevice = physicalDevice; - this->mDevice = device; - this->mComputeQueue = computeQueue; - this->mQueueIndex = queueIndex; - - this->createCommandPool(); - this->createCommandBuffer(); - if (totalTimestamps > 0) - this->createTimestampQueryPool(totalTimestamps + - 1); //+1 for the first one -} - -Sequence::~Sequence() -{ - KP_LOG_DEBUG("Kompute Sequence Destructor started"); - - if (this->mDevice) { - this->destroy(); - } -} - -void -Sequence::begin() -{ - KP_LOG_DEBUG("Kompute sequence called BEGIN"); - - if (this->isRecording()) { - KP_LOG_DEBUG("Kompute Sequence begin called when already recording"); - return; - } - - if (this->isRunning()) { - throw std::runtime_error( - "Kompute Sequence begin called when sequence still running"); - } - - KP_LOG_INFO("Kompute Sequence command now started recording"); - this->mCommandBuffer->begin(vk::CommandBufferBeginInfo()); - this->mRecording = true; - - // latch the first timestamp before any commands are submitted - if (this->timestampQueryPool) - this->mCommandBuffer->writeTimestamp( - vk::PipelineStageFlagBits::eAllCommands, - *this->timestampQueryPool, - 0); -} - -void -Sequence::end() -{ - KP_LOG_DEBUG("Kompute Sequence calling END"); - - if (this->isRunning()) { - throw std::runtime_error( - "Kompute Sequence begin called when sequence still running"); - } - - if (!this->isRecording()) { - KP_LOG_WARN("Kompute Sequence end called when not recording"); - return; - } else { - KP_LOG_INFO("Kompute Sequence command recording END"); - this->mCommandBuffer->end(); - this->mRecording = false; - } -} - -void -Sequence::clear() -{ - KP_LOG_DEBUG("Kompute Sequence calling clear"); - if (this->isRecording()) { - this->end(); - } -} - -std::shared_ptr -Sequence::eval() -{ - KP_LOG_DEBUG("Kompute sequence EVAL BEGIN"); - - return this->evalAsync()->evalAwait(); -} - -std::shared_ptr -Sequence::eval(std::shared_ptr op) -{ - this->clear(); - return this->record(op)->eval(); -} - 
-std::shared_ptr -Sequence::evalAsync() -{ - if (this->isRecording()) { - this->end(); - } - - if (this->mIsRunning) { - throw std::runtime_error( - "Kompute Sequence evalAsync called when an eval async was " - "called without successful wait"); - } - - this->mIsRunning = true; - - for (size_t i = 0; i < this->mOperations.size(); i++) { - this->mOperations[i]->preEval(*this->mCommandBuffer); - } - - vk::SubmitInfo submitInfo( - 0, nullptr, nullptr, 1, this->mCommandBuffer.get()); - - this->mFence = this->mDevice->createFence(vk::FenceCreateInfo()); - - KP_LOG_DEBUG( - "Kompute sequence submitting command buffer into compute queue"); - - this->mComputeQueue->submit(1, &submitInfo, this->mFence); - - return shared_from_this(); -} - -std::shared_ptr -Sequence::evalAsync(std::shared_ptr op) -{ - this->clear(); - this->record(op); - this->evalAsync(); - return shared_from_this(); -} - -std::shared_ptr -Sequence::evalAwait(uint64_t waitFor) -{ - if (!this->mIsRunning) { - KP_LOG_WARN("Kompute Sequence evalAwait called without existing eval"); - return shared_from_this(); - } - - vk::Result result = - this->mDevice->waitForFences(1, &this->mFence, VK_TRUE, waitFor); - this->mDevice->destroy( - this->mFence, (vk::Optional)nullptr); - - this->mIsRunning = false; - - if (result == vk::Result::eTimeout) { - KP_LOG_WARN("Kompute Sequence evalAwait reached timeout of {}", - waitFor); - return shared_from_this(); - } - - for (size_t i = 0; i < this->mOperations.size(); i++) { - this->mOperations[i]->postEval(*this->mCommandBuffer); - } - - return shared_from_this(); -} - -bool -Sequence::isRunning() const -{ - return this->mIsRunning; -} - -bool -Sequence::isRecording() const -{ - return this->mRecording; -} - -bool -Sequence::isInit() const -{ - return this->mDevice && this->mCommandPool && this->mCommandBuffer && - this->mComputeQueue; -} - -void -Sequence::rerecord() -{ - this->end(); - std::vector> ops = this->mOperations; - this->mOperations.clear(); - for (const std::shared_ptr& op : ops) { - this->record(op); - } -} - -void -Sequence::destroy() -{ - KP_LOG_DEBUG("Kompute Sequence destroy called"); - - if (!this->mDevice) { - KP_LOG_WARN("Kompute Sequence destroy called " - "with null Device pointer"); - return; - } - - if (this->mFreeCommandBuffer) { - KP_LOG_INFO("Freeing CommandBuffer"); - if (!this->mCommandBuffer) { - KP_LOG_WARN("Kompute Sequence destroy called with null " - "CommandPool pointer"); - return; - } - this->mDevice->freeCommandBuffers( - *this->mCommandPool, 1, this->mCommandBuffer.get()); - - this->mCommandBuffer = nullptr; - this->mFreeCommandBuffer = false; - - KP_LOG_DEBUG("Kompute Sequence Freed CommandBuffer"); - } - - if (this->mFreeCommandPool) { - KP_LOG_INFO("Destroying CommandPool"); - if (this->mCommandPool == nullptr) { - KP_LOG_WARN("Kompute Sequence destroy called with null " - "CommandPool pointer"); - return; - } - this->mDevice->destroy( - *this->mCommandPool, - (vk::Optional)nullptr); - - this->mCommandPool = nullptr; - this->mFreeCommandPool = false; - - KP_LOG_DEBUG("Kompute Sequence Destroyed CommandPool"); - } - - if (this->mOperations.size()) { - KP_LOG_INFO("Kompute Sequence clearing operations buffer"); - this->mOperations.clear(); - } - - if (this->timestampQueryPool) { - KP_LOG_INFO("Destroying QueryPool"); - this->mDevice->destroy( - *this->timestampQueryPool, - (vk::Optional)nullptr); - - this->timestampQueryPool = nullptr; - KP_LOG_DEBUG("Kompute Sequence Destroyed QueryPool"); - } - - if (this->mDevice) { - this->mDevice = nullptr; - } - if 
(this->mPhysicalDevice) { - this->mPhysicalDevice = nullptr; - } - if (this->mComputeQueue) { - this->mComputeQueue = nullptr; - } -} - -std::shared_ptr -Sequence::record(std::shared_ptr op) -{ - KP_LOG_DEBUG("Kompute Sequence record function started"); - - this->begin(); - - KP_LOG_DEBUG( - "Kompute Sequence running record on OpBase derived class instance"); - - op->record(*this->mCommandBuffer); - - this->mOperations.push_back(op); - - if (this->timestampQueryPool) - this->mCommandBuffer->writeTimestamp( - vk::PipelineStageFlagBits::eAllCommands, - *this->timestampQueryPool, - this->mOperations.size()); - - return shared_from_this(); -} - -void -Sequence::createCommandPool() -{ - KP_LOG_DEBUG("Kompute Sequence creating command pool"); - - if (!this->mDevice) { - throw std::runtime_error("Kompute Sequence device is null"); - } - - this->mFreeCommandPool = true; - - vk::CommandPoolCreateInfo commandPoolInfo(vk::CommandPoolCreateFlags(), - this->mQueueIndex); - this->mCommandPool = std::make_shared(); - this->mDevice->createCommandPool( - &commandPoolInfo, nullptr, this->mCommandPool.get()); - KP_LOG_DEBUG("Kompute Sequence Command Pool Created"); -} - -void -Sequence::createCommandBuffer() -{ - KP_LOG_DEBUG("Kompute Sequence creating command buffer"); - if (!this->mDevice) { - throw std::runtime_error("Kompute Sequence device is null"); - } - if (!this->mCommandPool) { - throw std::runtime_error("Kompute Sequence command pool is null"); - } - - this->mFreeCommandBuffer = true; - - vk::CommandBufferAllocateInfo commandBufferAllocateInfo( - *this->mCommandPool, vk::CommandBufferLevel::ePrimary, 1); - - this->mCommandBuffer = std::make_shared(); - this->mDevice->allocateCommandBuffers(&commandBufferAllocateInfo, - this->mCommandBuffer.get()); - KP_LOG_DEBUG("Kompute Sequence Command Buffer Created"); -} - -void -Sequence::createTimestampQueryPool(uint32_t totalTimestamps) -{ - KP_LOG_DEBUG("Kompute Sequence creating query pool"); - if (!this->isInit()) { - throw std::runtime_error( - "createTimestampQueryPool() called on uninitialized Sequence"); - } - if (!this->mPhysicalDevice) { - throw std::runtime_error("Kompute Sequence physical device is null"); - } - - vk::PhysicalDeviceProperties physicalDeviceProperties = - this->mPhysicalDevice->getProperties(); - - if (physicalDeviceProperties.limits.timestampComputeAndGraphics) { - vk::QueryPoolCreateInfo queryPoolInfo; - queryPoolInfo.setQueryCount(totalTimestamps); - queryPoolInfo.setQueryType(vk::QueryType::eTimestamp); - this->timestampQueryPool = std::make_shared( - this->mDevice->createQueryPool(queryPoolInfo)); - - KP_LOG_DEBUG("Query pool for timestamps created"); - } else { - throw std::runtime_error("Device does not support timestamps"); - } -} - -std::vector -Sequence::getTimestamps() -{ - if (!this->timestampQueryPool) - throw std::runtime_error("Timestamp latching not enabled"); - - const auto n = this->mOperations.size() + 1; - std::vector timestamps(n, 0); - this->mDevice->getQueryPoolResults( - *this->timestampQueryPool, - 0, - n, - timestamps.size() * sizeof(std::uint64_t), - timestamps.data(), - sizeof(uint64_t), - vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait); - - return timestamps; -} - -} diff --git a/kompute/src/Tensor.cpp b/kompute/src/Tensor.cpp deleted file mode 100644 index 84dce08e0..000000000 --- a/kompute/src/Tensor.cpp +++ /dev/null @@ -1,450 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -#include "kompute/Tensor.hpp" - -namespace kp { - -std::string -Tensor::toString(Tensor::TensorDataTypes dt) 
-{ - switch (dt) { - case TensorDataTypes::eBool: - return "eBool"; - case TensorDataTypes::eInt: - return "eInt"; - case TensorDataTypes::eUnsignedInt: - return "eUnsignedInt"; - case TensorDataTypes::eFloat: - return "eFloat"; - case TensorDataTypes::eDouble: - return "eDouble"; - default: - return "unknown"; - } -} - -std::string -Tensor::toString(Tensor::TensorTypes dt) -{ - switch (dt) { - case TensorTypes::eDevice: - return "eDevice"; - case TensorTypes::eHost: - return "eHost"; - case TensorTypes::eStorage: - return "eStorage"; - default: - return "unknown"; - } -} - -Tensor::Tensor(std::shared_ptr physicalDevice, - std::shared_ptr device, - void* data, - uint32_t elementTotalCount, - uint32_t elementMemorySize, - const TensorDataTypes& dataType, - vk::DeviceMemory *primaryMemory, - vk::Buffer *primaryBuffer, - vk::DeviceMemory *stagingMemory, - vk::Buffer *stagingBuffer, - vk::DeviceSize offset, - const TensorTypes& tensorType) -{ - KP_LOG_DEBUG("Kompute Tensor constructor data length: {}, and type: {}", - elementTotalCount, - Tensor::toString(tensorType)); - - this->mPhysicalDevice = physicalDevice; - this->mDevice = device; - this->mDataType = dataType; - this->mTensorType = tensorType; - - this->rebuild(data, elementTotalCount, elementMemorySize, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset); -} - -Tensor::~Tensor() -{ - KP_LOG_DEBUG("Kompute Tensor destructor started. Type: {}", - Tensor::toString(this->tensorType())); - - if (this->mDevice) { - this->destroy(); - } - - KP_LOG_DEBUG("Kompute Tensor destructor success"); -} - -void -Tensor::rebuild(void* /*data*/, - uint32_t elementTotalCount, - uint64_t memorySize, - vk::DeviceMemory *primaryMemory, - vk::Buffer *primaryBuffer, - vk::DeviceMemory *stagingMemory, - vk::Buffer *stagingBuffer, - vk::DeviceSize offset) -{ - KP_LOG_DEBUG("Kompute Tensor rebuilding with size {}", elementTotalCount); - - this->mSize = elementTotalCount; - this->mMemorySize = memorySize; - this->mOffset = offset; - - if (this->mPrimaryBuffer || this->mPrimaryMemory) { - KP_LOG_DEBUG( - "Kompute Tensor destroying existing resources before rebuild"); - this->destroy(); - } - - this->setGPUResources(primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, offset); -} - -Tensor::TensorTypes -Tensor::tensorType() -{ - return this->mTensorType; -} - -bool -Tensor::isInit() -{ - return this->mDevice && this->mPrimaryBuffer && this->mPrimaryMemory && - this->mRawData; -} - -uint32_t -Tensor::size() -{ - return this->mSize; -} - -uint64_t -Tensor::memorySize() -{ - return this->mMemorySize; -} - -kp::Tensor::TensorDataTypes -Tensor::dataType() -{ - return this->mDataType; -} - -void* -Tensor::rawData() -{ - return this->mRawData; -} - -void -Tensor::setRawData(const void* data) -{ - memcpy(this->mRawData, data, this->memorySize()); -} - -void -Tensor::recordCopyFrom(const vk::CommandBuffer& commandBuffer, - std::shared_ptr copyFromTensor) -{ - - vk::DeviceSize bufferSize(this->memorySize()); - vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize); - - KP_LOG_DEBUG("Kompute Tensor recordCopyFrom data size {}.", bufferSize); - - this->recordCopyBuffer(commandBuffer, - copyFromTensor->mPrimaryBuffer, - this->mPrimaryBuffer, - bufferSize, - copyRegion); -} - -void -Tensor::recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer) -{ - if (!this->mStagingBuffer) - return; - - vk::DeviceSize bufferSize(this->memorySize()); - vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize); - - KP_LOG_DEBUG("Kompute Tensor copying data 
size {}.", bufferSize); - - this->recordCopyBuffer(commandBuffer, - this->mStagingBuffer, - this->mPrimaryBuffer, - bufferSize, - copyRegion); -} - -void -Tensor::recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer) -{ - if (!this->mStagingBuffer) - return; - - vk::DeviceSize bufferSize(this->memorySize()); - vk::BufferCopy copyRegion(mOffset, mOffset, bufferSize); - - KP_LOG_DEBUG("Kompute Tensor copying data size {}.", bufferSize); - - this->recordCopyBuffer(commandBuffer, - this->mPrimaryBuffer, - this->mStagingBuffer, - bufferSize, - copyRegion); -} - -void -Tensor::recordCopyBuffer(const vk::CommandBuffer& commandBuffer, - vk::Buffer *bufferFrom, - vk::Buffer *bufferTo, - vk::DeviceSize /*bufferSize*/, - vk::BufferCopy copyRegion) -{ - - commandBuffer.copyBuffer(*bufferFrom, *bufferTo, copyRegion); -} - -void -Tensor::recordFill(const vk::CommandBuffer &commandBuffer, - uint32_t fill) -{ - commandBuffer.fillBuffer(*this->mPrimaryBuffer, mOffset, this->memorySize(), fill); -} - -void -Tensor::recordPrimaryBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, - vk::AccessFlagBits srcAccessMask, - vk::AccessFlagBits dstAccessMask, - vk::PipelineStageFlagBits srcStageMask, - vk::PipelineStageFlagBits dstStageMask) -{ - KP_LOG_DEBUG("Kompute Tensor recording PRIMARY buffer memory barrier"); - - this->recordBufferMemoryBarrier(commandBuffer, - *this->mPrimaryBuffer, - srcAccessMask, - dstAccessMask, - srcStageMask, - dstStageMask); -} - -void -Tensor::recordStagingBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, - vk::AccessFlagBits srcAccessMask, - vk::AccessFlagBits dstAccessMask, - vk::PipelineStageFlagBits srcStageMask, - vk::PipelineStageFlagBits dstStageMask) -{ - if (!this->mStagingBuffer) - return; - - KP_LOG_DEBUG("Kompute Tensor recording STAGING buffer memory barrier"); - - this->recordBufferMemoryBarrier(commandBuffer, - *this->mStagingBuffer, - srcAccessMask, - dstAccessMask, - srcStageMask, - dstStageMask); -} - -void -Tensor::recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, - const vk::Buffer& buffer, - vk::AccessFlagBits srcAccessMask, - vk::AccessFlagBits dstAccessMask, - vk::PipelineStageFlagBits srcStageMask, - vk::PipelineStageFlagBits dstStageMask) -{ - KP_LOG_DEBUG("Kompute Tensor recording buffer memory barrier"); - - vk::DeviceSize bufferSize = this->memorySize(); - - vk::BufferMemoryBarrier bufferMemoryBarrier; - bufferMemoryBarrier.buffer = buffer; - bufferMemoryBarrier.size = bufferSize; - bufferMemoryBarrier.srcAccessMask = srcAccessMask; - bufferMemoryBarrier.dstAccessMask = dstAccessMask; - bufferMemoryBarrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - bufferMemoryBarrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - - commandBuffer.pipelineBarrier(srcStageMask, - dstStageMask, - vk::DependencyFlags(), - nullptr, - bufferMemoryBarrier, - nullptr); -} - -vk::DescriptorBufferInfo -Tensor::constructDescriptorBufferInfo() -{ - KP_LOG_DEBUG("Kompute Tensor construct descriptor buffer info size {}", - this->memorySize()); - vk::DeviceSize bufferSize = this->memorySize(); - return vk::DescriptorBufferInfo(*this->mPrimaryBuffer, - mOffset, // offset - bufferSize); -} - -vk::BufferUsageFlags -Tensor::getPrimaryBufferUsageFlags() -{ - switch (this->mTensorType) { - case TensorTypes::eDevice: - return vk::BufferUsageFlagBits::eStorageBuffer | - vk::BufferUsageFlagBits::eTransferSrc | - vk::BufferUsageFlagBits::eTransferDst; - break; - case TensorTypes::eHost: - return vk::BufferUsageFlagBits::eStorageBuffer | - 
vk::BufferUsageFlagBits::eTransferSrc | - vk::BufferUsageFlagBits::eTransferDst; - break; - case TensorTypes::eStorage: - return vk::BufferUsageFlagBits::eStorageBuffer; - break; - default: - throw std::runtime_error("Kompute Tensor invalid tensor type"); - } -} - -vk::MemoryPropertyFlags -Tensor::getPrimaryMemoryPropertyFlags() -{ - switch (this->mTensorType) { - case TensorTypes::eDevice: - return vk::MemoryPropertyFlagBits::eDeviceLocal; - break; - case TensorTypes::eHost: - return vk::MemoryPropertyFlagBits::eHostVisible | - vk::MemoryPropertyFlagBits::eHostCoherent; - break; - case TensorTypes::eStorage: - return vk::MemoryPropertyFlagBits::eDeviceLocal; - break; - default: - throw std::runtime_error("Kompute Tensor invalid tensor type"); - } -} - -vk::BufferUsageFlags -Tensor::getStagingBufferUsageFlags() -{ - switch (this->mTensorType) { - case TensorTypes::eDevice: - return vk::BufferUsageFlagBits::eTransferSrc | - vk::BufferUsageFlagBits::eTransferDst; - break; - default: - throw std::runtime_error("Kompute Tensor invalid tensor type"); - } -} - -vk::MemoryPropertyFlags -Tensor::getStagingMemoryPropertyFlags() -{ - switch (this->mTensorType) { - case TensorTypes::eDevice: - return vk::MemoryPropertyFlagBits::eHostVisible | - vk::MemoryPropertyFlagBits::eHostCoherent; - break; - default: - throw std::runtime_error("Kompute Tensor invalid tensor type"); - } -} - -void -Tensor::setGPUResources(vk::DeviceMemory *primaryMemory, - vk::Buffer *primaryBuffer, - vk::DeviceMemory *stagingMemory, - vk::Buffer *stagingBuffer, - vk::DeviceSize /*offset*/) -{ - KP_LOG_DEBUG("Kompute Tensor creating buffer"); - - if (!this->mPhysicalDevice) { - throw std::runtime_error("Kompute Tensor phyisical device is null"); - } - if (!this->mDevice) { - throw std::runtime_error("Kompute Tensor device is null"); - } - - KP_LOG_DEBUG("Kompute Tensor creating primary buffer and memory"); - - this->mPrimaryBuffer = primaryBuffer; - this->mPrimaryMemory = primaryMemory; - - if (this->mTensorType == TensorTypes::eDevice) { - KP_LOG_DEBUG("Kompute Tensor creating staging buffer and memory"); - - this->mStagingBuffer = stagingBuffer; - this->mStagingMemory = stagingMemory; - } - - KP_LOG_DEBUG("Kompute Tensor buffer & memory creation successful"); -} - -void -Tensor::destroy() -{ - KP_LOG_DEBUG("Kompute Tensor started destroy()"); - - // Setting raw data to null regardless whether device is available to - // invalidate Tensor - this->mRawData = nullptr; - this->mSize = 0; - this->mMemorySize = 0; - - if (!this->mDevice) { - KP_LOG_WARN( - "Kompute Tensor destructor reached with null Device pointer"); - return; - } - - if (this->mDevice) { - this->mDevice = nullptr; - } - - KP_LOG_DEBUG("Kompute Tensor successful destroy()"); -} - -template<> -Tensor::TensorDataTypes -TensorT::dataType() -{ - return Tensor::TensorDataTypes::eBool; -} - -template<> -Tensor::TensorDataTypes -TensorT::dataType() -{ - return Tensor::TensorDataTypes::eInt; -} - -template<> -Tensor::TensorDataTypes -TensorT::dataType() -{ - return Tensor::TensorDataTypes::eUnsignedInt; -} - -template<> -Tensor::TensorDataTypes -TensorT::dataType() -{ - return Tensor::TensorDataTypes::eFloat; -} - -template<> -Tensor::TensorDataTypes -TensorT::dataType() -{ - return Tensor::TensorDataTypes::eDouble; -} - -} diff --git a/kompute/src/include/CMakeLists.txt b/kompute/src/include/CMakeLists.txt deleted file mode 100644 index 53e9d8ae6..000000000 --- a/kompute/src/include/CMakeLists.txt +++ /dev/null @@ -1,47 +0,0 @@ -cmake_minimum_required(VERSION 3.20) - -# 
#################################################### -# Kompute -# #################################################### -target_include_directories(kompute PUBLIC $ - $) - -target_sources(kompute PRIVATE - - # Header files (useful in IDEs) - kompute/Algorithm.hpp - kompute/Core.hpp - kompute/Kompute.hpp - kompute/Manager.hpp - kompute/Sequence.hpp - kompute/Tensor.hpp - - kompute/operations/OpAlgoDispatch.hpp - kompute/operations/OpBase.hpp - kompute/operations/OpMemoryBarrier.hpp - kompute/operations/OpMult.hpp - kompute/operations/OpTensorCopy.hpp - kompute/operations/OpTensorFill.hpp - kompute/operations/OpTensorSyncDevice.hpp - kompute/operations/OpTensorSyncLocal.hpp - kompute/operations/OpBufferSyncDevice.hpp - kompute/operations/OpBufferSyncLocal.hpp - - kompute/logger/Logger.hpp -) - -#install(DIRECTORY kompute DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) - -# #################################################### -# Logger -# #################################################### -target_include_directories(kp_logger PUBLIC $ - $) - -target_sources(kp_logger PRIVATE - - # Header files (useful in IDEs) - kompute/logger/Logger.hpp -) - -#install(DIRECTORY logger DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) \ No newline at end of file diff --git a/kompute/src/include/kompute/Algorithm.hpp b/kompute/src/include/kompute/Algorithm.hpp deleted file mode 100644 index e5fef1f56..000000000 --- a/kompute/src/include/kompute/Algorithm.hpp +++ /dev/null @@ -1,330 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/Core.hpp" - -#include "fmt/format.h" -#include "kompute/Tensor.hpp" -#include "logger/Logger.hpp" - -namespace kp { - -/** - Abstraction for compute shaders that are run on top of tensors grouped via - ParameterGroups (which group descriptorsets) -*/ -class Algorithm -{ - public: - /** - * Main constructor for algorithm with configuration parameters to create - * the underlying resources. - * - * @param device The Vulkan device to use for creating resources - * @param tensors (optional) The tensors to use to create the descriptor - * resources - * @param spirv (optional) The spirv code to use to create the algorithm - * @param workgroup (optional) The kp::Workgroup to use for the dispatch - * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set. - * @param specializationConstants (optional) The templatable param is to be - * used to initialize the specialization constants which cannot be changed - * once set. - * @param pushConstants (optional) This templatable param is to be used - * when initializing the pipeline, which set the size of the push constants - * - these can be modified but all new values must have the same data type - * and length as otherwise it will result in errors. - */ - template - Algorithm(std::shared_ptr device, - vk::PipelineCache *pipelineCache, - vk::DescriptorPool *pool, - const std::vector>& tensors = {}, - const std::vector& spirv = {}, - const Workgroup& workgroup = {}, - const std::vector& specializationConstants = {}, - const std::vector
& pushConstants = {}) - { - KP_LOG_DEBUG("Kompute Algorithm Constructor with device"); - - this->mDevice = device; - this->mPipelineCache = pipelineCache; - this->mDescriptorPool = pool; - - if (tensors.size() && spirv.size()) { - KP_LOG_INFO( - "Kompute Algorithm initialising with tensor size: {} and " - "spirv size: {}", - tensors.size(), - spirv.size()); - this->rebuild(tensors, - spirv, - workgroup, - specializationConstants, - pushConstants); - } else { - KP_LOG_INFO( - "Kompute Algorithm constructor with empty tensors and or " - "spirv so not rebuilding vulkan components"); - } - } - - /** - * Rebuild function to reconstruct algorithm with configuration parameters - * to create the underlying resources. - * - * @param tensors The tensors to use to create the descriptor resources - * @param spirv The spirv code to use to create the algorithm - * @param workgroup (optional) The kp::Workgroup to use for the dispatch - * which defaults to kp::Workgroup(tensor[0].size(), 1, 1) if not set. - * @param specializationConstants (optional) The std::vector to use - * to initialize the specialization constants which cannot be changed once - * set. - * @param pushConstants (optional) The std::vector to use when - * initializing the pipeline, which set the size of the push constants - - * these can be modified but all new values must have the same vector size - * as this initial value. - */ - template - void rebuild(const std::vector>& tensors, - const std::vector& spirv, - const Workgroup& workgroup = {}, - const std::vector& specializationConstants = {}, - const std::vector
& pushConstants = {}) - { - KP_LOG_DEBUG("Kompute Algorithm rebuild started"); - - this->mTensors = tensors; - this->mSpirv = spirv; - - if (specializationConstants.size()) { - if (this->mSpecializationConstantsData) { - free(this->mSpecializationConstantsData); - } - uint32_t memorySize = - sizeof(decltype(specializationConstants.back())); - uint32_t size = specializationConstants.size(); - uint32_t totalSize = size * memorySize; - this->mSpecializationConstantsData = malloc(totalSize); - memcpy(this->mSpecializationConstantsData, - specializationConstants.data(), - totalSize); - this->mSpecializationConstantsDataTypeMemorySize = memorySize; - this->mSpecializationConstantsSize = size; - } - - if (pushConstants.size()) { - if (this->mPushConstantsData) { - free(this->mPushConstantsData); - } - uint32_t memorySize = sizeof(decltype(pushConstants.back())); - uint32_t size = pushConstants.size(); - uint32_t totalSize = size * memorySize; - this->mPushConstantsData = malloc(totalSize); - memcpy(this->mPushConstantsData, pushConstants.data(), totalSize); - this->mPushConstantsDataTypeMemorySize = memorySize; - this->mPushConstantsSize = size; - } - - this->setWorkgroup( - workgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1); - - // Descriptor pool is created first so if available then destroy all - // before rebuild - if (this->isInit()) { - this->destroy(); - } - - this->createParameters(); - this->createShaderModule(); - this->createPipeline(); - } - - /** - * Destructor for Algorithm which is responsible for freeing and desroying - * respective pipelines and owned parameter groups. - */ - ~Algorithm(); - - /** - * Records the dispatch function with the provided template parameters or - * alternatively using the size of the tensor by default. - * - * @param commandBuffer Command buffer to record the algorithm resources to - */ - void recordDispatch(const vk::CommandBuffer& commandBuffer); - - /** - * Records command that binds the "core" algorithm components which consist - * of binding the pipeline and binding the descriptorsets. - * - * @param commandBuffer Command buffer to record the algorithm resources to - */ - void recordBindCore(const vk::CommandBuffer& commandBuffer); - - /** - * Records command that binds the push constants to the command buffer - * provided - * - it is required that the pushConstants provided are of the same size as - * the ones provided during initialization. - * - * @param commandBuffer Command buffer to record the algorithm resources to - */ - void recordBindPush(const vk::CommandBuffer& commandBuffer); - - /** - * function that checks all the gpu resource components to verify if these - * have been created and returns true if all are valid. - * - * @returns returns true if the algorithm is currently initialized. - */ - bool isInit(); - - /** - * Sets the work group to use in the recordDispatch - * - * @param workgroup The kp::Workgroup value to use to update the algorithm. - * It must have a value greater than 1 on the x value (index 1) otherwise it - * will be initialized on the size of the first tensor (ie. - * this->mTensor[0]->size()) - */ - void setWorkgroup(const Workgroup& workgroup, uint32_t minSize = 1); - /** - * Sets the push constants to the new value provided to use in the next - * bindPush() - * - * @param pushConstants The templatable vector is to be used to set the push - * constants to use in the next bindPush(...) calls. The constants provided - * must be of the same size as the ones created during initialization. 
- */ - template - void setPushConstants(const std::vector& pushConstants) - { - uint32_t memorySize = sizeof(decltype(pushConstants.back())); - uint32_t size = pushConstants.size(); - this->setPushConstants(pushConstants.data(), size, memorySize); - } - - void updateDescriptors(vk::DescriptorPool *pool) - { - this->mDescriptorPool = pool; - this->setWorkgroup( - this->mWorkgroup, this->mTensors.size() ? this->mTensors[0]->size() : 1); - - this->updateParameters(); // TODO: See if we can reduce this - } - - /** - * Sets the push constants to the new value provided to use in the next - * bindPush() with the raw memory block location and memory size to be used. - * - * @param data The raw data point to copy the data from, without modifying - * the pointer. - * @param size The number of data elements provided in the data - * @param memorySize The memory size of each of the data elements in bytes. - */ - void setPushConstants(const void* data, uint32_t size, uint32_t memorySize) - { - - uint32_t totalSize = memorySize * size; - uint32_t previousTotalSize = - this->mPushConstantsDataTypeMemorySize * this->mPushConstantsSize; - - if (totalSize != previousTotalSize) { - throw std::runtime_error(fmt::format( - "Kompute Algorithm push " - "constant total memory size provided is {} but expected {} bytes", - totalSize, - previousTotalSize)); - } - if (this->mPushConstantsData) { - free(this->mPushConstantsData); - } - - this->mPushConstantsData = malloc(totalSize); - memcpy(this->mPushConstantsData, data, totalSize); - this->mPushConstantsDataTypeMemorySize = memorySize; - this->mPushConstantsSize = size; - } - - /** - * Gets the current workgroup from the algorithm. - * - * @param The kp::Constant to use to set the push constants to use in the - * next bindPush(...) calls. The constants provided must be of the same size - * as the ones created during initialization. - */ - const Workgroup& getWorkgroup(); - /** - * Gets the specialization constants of the current algorithm. - * - * @returns The std::vector currently set for specialization - * constants - */ - template - const std::vector getSpecializationConstants() - { - return { (T*)this->mSpecializationConstantsData, - ((T*)this->mSpecializationConstantsData) + - this->mSpecializationConstantsSize }; - } - /** - * Gets the specialization constants of the current algorithm. - * - * @returns The std::vector currently set for push constants - */ - template - const std::vector getPushConstants() - { - return { (T*)this->mPushConstantsData, - ((T*)this->mPushConstantsData) + this->mPushConstantsSize }; - } - /** - * Gets the current tensors that are used in the algorithm. - * - * @returns The list of tensors used in the algorithm. 
- */ - const std::vector>& getTensors(); - void setTensors(const std::vector>& tensors); - - void destroy(); - - private: - // -------------- NEVER OWNED RESOURCES - std::shared_ptr mDevice; - std::vector> mTensors; - - // -------------- OPTIONALLY OWNED RESOURCES - std::shared_ptr mDescriptorSetLayout; - bool mFreeDescriptorSetLayout = false; - vk::DescriptorPool *mDescriptorPool = nullptr; - std::shared_ptr mDescriptorSet; - bool mFreeDescriptorSet = false; - std::shared_ptr mShaderModule; - bool mFreeShaderModule = false; - std::shared_ptr mPipelineLayout; - bool mFreePipelineLayout = false; - vk::PipelineCache *mPipelineCache = nullptr; - std::shared_ptr mPipeline; - bool mFreePipeline = false; - - // -------------- ALWAYS OWNED RESOURCES - std::vector mSpirv; - void* mSpecializationConstantsData = nullptr; - uint32_t mSpecializationConstantsDataTypeMemorySize = 0; - uint32_t mSpecializationConstantsSize = 0; - void* mPushConstantsData = nullptr; - uint32_t mPushConstantsDataTypeMemorySize = 0; - uint32_t mPushConstantsSize = 0; - Workgroup mWorkgroup; - - // Create util functions - void createShaderModule(); - void createPipeline(); - - // Parameters - void freeParameters(); - void createParameters(); - void updateParameters(); -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/Core.hpp b/kompute/src/include/kompute/Core.hpp deleted file mode 100644 index 406e6b5d4..000000000 --- a/kompute/src/include/kompute/Core.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include - -// Typedefs to simplify interaction with core types -namespace kp { -typedef std::array Workgroup; -typedef std::vector Constants; -} - -// Must be after vulkan is included -#ifndef KOMPUTE_VK_API_VERSION -#ifndef KOMPUTE_VK_API_MAJOR_VERSION -#define KOMPUTE_VK_API_MAJOR_VERSION 1 -#endif // KOMPUTE_VK_API_MAJOR_VERSION -#ifndef KOMPUTE_VK_API_MINOR_VERSION -#define KOMPUTE_VK_API_MINOR_VERSION 2 -#endif // KOMPUTE_VK_API_MINOR_VERSION -#define KOMPUTE_VK_API_VERSION \ - VK_MAKE_VERSION( \ - KOMPUTE_VK_API_MAJOR_VERSION, KOMPUTE_VK_API_MINOR_VERSION, 0) -#endif // KOMPUTE_VK_API_VERSION - -#if defined(KOMPUTE_BUILD_PYTHON) -#include -namespace py = pybind11; -// from python/src/main.cpp -extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error; -#endif diff --git a/kompute/src/include/kompute/Kompute.hpp b/kompute/src/include/kompute/Kompute.hpp deleted file mode 100644 index 70e0dd433..000000000 --- a/kompute/src/include/kompute/Kompute.hpp +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include "Algorithm.hpp" -#include "Core.hpp" -#include "Manager.hpp" -#include "Sequence.hpp" -#include "Tensor.hpp" - -#include "operations/OpAlgoDispatch.hpp" -#include "operations/OpBase.hpp" -#include "operations/OpMemoryBarrier.hpp" -#include "operations/OpMult.hpp" -#include "operations/OpTensorCopy.hpp" -#include "operations/OpTensorSyncDevice.hpp" -#include "operations/OpTensorSyncLocal.hpp" -#include "operations/OpBufferSyncDevice.hpp" -#include "operations/OpBufferSyncLocal.hpp" -#include "operations/OpTensorFill.hpp" - -// Will be build by CMake and placed inside the build directory -#include "ShaderLogisticRegression.hpp" -#include "ShaderOpMult.hpp" diff --git a/kompute/src/include/kompute/Manager.hpp b/kompute/src/include/kompute/Manager.hpp deleted file mode 100644 index 780c352eb..000000000 --- a/kompute/src/include/kompute/Manager.hpp +++ /dev/null @@ -1,284 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include 
- -#include "kompute/Core.hpp" - -#include "kompute/Sequence.hpp" -#include "logger/Logger.hpp" - -#define KP_DEFAULT_SESSION "DEFAULT" - -namespace kp { - -/** - Base orchestrator which creates and manages device and child components -*/ -class Manager -{ - public: - /** - Base constructor. - */ - Manager(); - - /** - * Manager destructor which would ensure all owned resources are destroyed - * unless explicitly stated that resources should not be destroyed or freed. - */ - ~Manager(); - - bool hasInstance() const { - return this->mInstance.get(); - } - - bool hasDevice() const { - return this->mDevice.get(); - } - - bool hasVulkan() const { - return this->mDynamicLoader.get(); - } - - /** - * Initialize a device. - * - * @param physicalDeviceIndex The index of the physical device to use - * @param familyQueueIndices (Optional) List of queue indices to add for - * explicit allocation - * @param desiredExtensions The desired extensions to load from - * physicalDevice - */ - void initializeDevice(uint32_t physicalDeviceIndex, - const std::vector& familyQueueIndices = {}, - const std::vector& desiredExtensions = {}); - - /** - * Create a managed sequence that will be destroyed by this manager - * if it hasn't been destroyed by its reference count going to zero. - * - * @param queueIndex The queue to use from the available queues - * @param nrOfTimestamps The maximum number of timestamps to allocate. - * If zero (default), disables latching of timestamps. - * @returns Shared pointer with initialised sequence - */ - std::shared_ptr sequence(uint32_t queueIndex = 0, - uint32_t totalTimestamps = 0); - - /** - * Create a managed tensor that will be destroyed by this manager - * if it hasn't been destroyed by its reference count going to zero. - * - * @param data The data to initialize the tensor with - * @param tensorType The type of tensor to initialize - * @returns Shared pointer with initialised tensor - */ - template - std::shared_ptr> tensorT( - const std::vector& data, - vk::DeviceMemory *primaryMemory, - vk::Buffer *primaryBuffer, - vk::DeviceMemory *stagingMemory, - vk::Buffer *stagingBuffer, - Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice) - { - KP_LOG_DEBUG("Kompute Manager tensor creation triggered"); - - std::shared_ptr> tensor{ new kp::TensorT( - this->mPhysicalDevice, this->mDevice, data, primaryMemory, primaryBuffer, stagingMemory, stagingBuffer, tensorType) }; - - if (this->mManageResources) { - this->mManagedTensors.push_back(tensor); - } - - return tensor; - } - - std::shared_ptr tensor( - void* data, - uint32_t elementTotalCount, - uint64_t memorySize, - const Tensor::TensorDataTypes& dataType, - vk::DeviceMemory *primaryMemory, - vk::Buffer *primaryBuffer, - vk::DeviceMemory *stagingMemory, - vk::Buffer *stagingBuffer, - vk::DeviceSize offset, - Tensor::TensorTypes tensorType = Tensor::TensorTypes::eDevice) - { - std::shared_ptr tensor{ new kp::Tensor(this->mPhysicalDevice, - this->mDevice, - data, - elementTotalCount, - memorySize, - dataType, - primaryMemory, - primaryBuffer, - stagingMemory, - stagingBuffer, - offset, - tensorType) }; - - if (this->mManageResources) { - this->mManagedTensors.push_back(tensor); - } - - return tensor; - } - - /** - * Default non-template function that can be used to create algorithm - * objects which provides default types to the push and spec constants as - * floats. 
- * - * @param tensors (optional) The tensors to initialise the algorithm with - * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch - * @param workgroup (optional) kp::Workgroup for algorithm to use, and - * defaults to (tensor[0].size(), 1, 1) - * @param specializationConstants (optional) float vector to use for - * specialization constants, and defaults to an empty constant - * @param pushConstants (optional) float vector to use for push constants, - * and defaults to an empty constant - * @returns Shared pointer with initialised algorithm - */ - std::shared_ptr algorithm( - const std::string &name, - vk::DescriptorPool *pool, - const std::vector>& tensors = {}, - const std::vector& spirv = {}, - const Workgroup& workgroup = {}, - const std::vector& specializationConstants = {}, - const std::vector& pushConstants = {}) - { - return this->algorithm<>( - name, pool, tensors, spirv, workgroup, specializationConstants, pushConstants); - } - - /** - * Create a managed algorithm that will be destroyed by this manager - * if it hasn't been destroyed by its reference count going to zero. - * - * @param tensors (optional) The tensors to initialise the algorithm with - * @param spirv (optional) The SPIRV bytes for the algorithm to dispatch - * @param workgroup (optional) kp::Workgroup for algorithm to use, and - * defaults to (tensor[0].size(), 1, 1) - * @param specializationConstants (optional) templatable vector parameter to - * use for specialization constants, and defaults to an empty constant - * @param pushConstants (optional) templatable vector parameter to use for - * push constants, and defaults to an empty constant - * @returns Shared pointer with initialised algorithm - */ - template - std::shared_ptr algorithm( - const std::string &name, - vk::DescriptorPool *pool, - const std::vector>& tensors, - const std::vector& spirv, - const Workgroup& workgroup, - const std::vector& specializationConstants, - const std::vector
& pushConstants) - { - - KP_LOG_DEBUG("Kompute Manager algorithm creation triggered"); - - std::shared_ptr algorithm{ new kp::Algorithm( - this->mDevice, - mPipelineCache.get(), - pool, - tensors, - spirv, - workgroup, - specializationConstants, - pushConstants) }; - - if (this->mManageResources) { - this->mManagedAlgorithmsMap.insert({name, algorithm}); - } - - return algorithm; - } - - bool hasAlgorithm(const std::string &name) const { - return mManagedAlgorithmsMap.find(name) != mManagedAlgorithmsMap.end(); - } - - std::shared_ptr getAlgorithm(const std::string &name) const { - auto it = mManagedAlgorithmsMap.find(name); - if (it != mManagedAlgorithmsMap.end()) { - return it->second; - } - return nullptr; - } - - /** - * Destroy the GPU resources and all managed resources by manager. - **/ - void destroy(); - /** - * Run a pseudo-garbage collection to release all the managed resources - * that have been already freed due to these reaching to zero ref count. - **/ - void clear(); - - /** - * Information about the current device. - * - * @return vk::PhysicalDeviceProperties containing information about the - *device - **/ - vk::PhysicalDeviceProperties getDeviceProperties() const; - - /** - * List the devices available in the current vulkan instance. - * - * @return vector of physical devices containing their respective properties - **/ - std::vector listDevices() const; - - /** - * The current Vulkan instance. - * - * @return a shared pointer to the current Vulkan instance held by this - *object - **/ - std::shared_ptr getVkInstance() const; - - std::shared_ptr device() const { return mDevice; } - std::shared_ptr physicalDevice() const { return mPhysicalDevice; } - std::shared_ptr pipelineCache() const { return mPipelineCache; } - - private: - // -------------- OPTIONALLY OWNED RESOURCES - std::shared_ptr mInstance = nullptr; - bool mFreeInstance = false; - std::shared_ptr mPhysicalDevice = nullptr; - std::shared_ptr mDevice = nullptr; - std::shared_ptr mDynamicLoader = nullptr; - bool mFreeDevice = false; - - // -------------- ALWAYS OWNED RESOURCES - std::vector> mManagedTensors; - std::vector> mManagedSequences; - std::unordered_map> mManagedAlgorithmsMap; - - std::vector mComputeQueueFamilyIndices; - std::vector> mComputeQueues; - std::shared_ptr mPipelineCache; - - bool mManageResources = false; - -#ifndef KOMPUTE_DISABLE_VK_DEBUG_LAYERS - vk::DebugReportCallbackEXT mDebugReportCallback; - vk::DispatchLoaderDynamic mDebugDispatcher; -#endif - - // Create functions - void createInstance(); - void createDevice(const std::vector& familyQueueIndices = {}, - uint32_t physicalDeviceIndex = 0, - const std::vector& desiredExtensions = {}); -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/Sequence.hpp b/kompute/src/include/kompute/Sequence.hpp deleted file mode 100644 index 3b29a6e2e..000000000 --- a/kompute/src/include/kompute/Sequence.hpp +++ /dev/null @@ -1,304 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/Core.hpp" - -#include "kompute/operations/OpAlgoDispatch.hpp" -#include "kompute/operations/OpBase.hpp" - -namespace kp { - -/** - * Container of operations that can be sent to GPU as batch - */ -class Sequence : public std::enable_shared_from_this -{ - public: - /** - * Main constructor for sequence which requires core vulkan components to - * generate all dependent resources. 
- * - * @param physicalDevice Vulkan physical device - * @param device Vulkan logical device - * @param computeQueue Vulkan compute queue - * @param queueIndex Vulkan compute queue index in device - * @param totalTimestamps Maximum number of timestamps to allocate - */ - Sequence(std::shared_ptr physicalDevice, - std::shared_ptr device, - std::shared_ptr computeQueue, - uint32_t queueIndex, - uint32_t totalTimestamps = 0); - /** - * Destructor for sequence which is responsible for cleaning all subsequent - * owned operations. - */ - ~Sequence(); - - /** - * Record function for operation to be added to the GPU queue in batch. This - * template requires classes to be derived from the OpBase class. This - * function also requires the Sequence to be recording, otherwise it will - * not be able to add the operation. - * - * @param op Object derived from kp::BaseOp that will be recoreded by the - * sequence which will be used when the operation is evaluated. - * @return shared_ptr of the Sequence class itself - */ - std::shared_ptr record(std::shared_ptr op); - - /** - * Record function for operation to be added to the GPU queue in batch. This - * template requires classes to be derived from the OpBase class. This - * function also requires the Sequence to be recording, otherwise it will - * not be able to add the operation. - * - * @param tensors Vector of tensors to use for the operation - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr record( - std::vector> tensors, - TArgs&&... params) - { - std::shared_ptr op{ new T(tensors, std::forward(params)...) }; - return this->record(op); - } - /** - * Record function for operation to be added to the GPU queue in batch. This - * template requires classes to be derived from the OpBase class. This - * function also requires the Sequence to be recording, otherwise it will - * not be able to add the operation. - * - * @param algorithm Algorithm to use for the record often used for OpAlgo - * operations - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr record(std::shared_ptr algorithm, - TArgs&&... params) - { - std::shared_ptr op{ new T(algorithm, - std::forward(params)...) }; - return this->record(op); - } - - /** - * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job synchronously (with a barrier). - * - * @return shared_ptr of the Sequence class itself - */ - std::shared_ptr eval(); - - /** - * Resets all the recorded and stored operations, records the operation - * provided and submits into the gpu as a submit job synchronously (with a - * barrier). - * - * @return shared_ptr of the Sequence class itself - */ - std::shared_ptr eval(std::shared_ptr op); - - /** - * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. - * - * @param tensors Vector of tensors to use for the operation - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr eval(std::vector> tensors, - TArgs&&... 
params) - { - std::shared_ptr op{ new T(tensors, std::forward(params)...) }; - return this->eval(op); - } - - template - std::shared_ptr eval(vk::Buffer *primaryBuffer, - vk::Buffer *stagingBuffer, - vk::DeviceSize size, - TArgs&&... params) - { - std::shared_ptr op{ new T(primaryBuffer, stagingBuffer, size, std::forward(params)...) }; - return this->eval(op); - } - - /** - * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. - * - * @param algorithm Algorithm to use for the record often used for OpAlgo - * operations - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr eval(std::shared_ptr algorithm, - TArgs&&... params) - { - std::shared_ptr op{ new T(algorithm, - std::forward(params)...) }; - return this->eval(op); - } - - /** - * Eval Async sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job without a barrier. EvalAwait() - * must ALWAYS be called after to ensure the sequence is terminated - * correctly. - * - * @return Boolean stating whether execution was successful. - */ - std::shared_ptr evalAsync(); - /** - * Clears currnet operations to record provided one in the vector of - * operations into the gpu as a submit job without a barrier. EvalAwait() - * must ALWAYS be called after to ensure the sequence is terminated - * correctly. - * - * @return Boolean stating whether execution was successful. - */ - std::shared_ptr evalAsync(std::shared_ptr op); - /** - * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. - * - * @param tensors Vector of tensors to use for the operation - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr evalAsync( - std::vector> tensors, - TArgs&&... params) - { - std::shared_ptr op{ new T(tensors, std::forward(params)...) }; - return this->evalAsync(op); - } - /** - * Eval sends all the recorded and stored operations in the vector of - * operations into the gpu as a submit job with a barrier. - * - * @param algorithm Algorithm to use for the record often used for OpAlgo - * operations - * @param TArgs Template parameters that are used to initialise operation - * which allows for extensible configurations on initialisation. - * @return shared_ptr of the Sequence class itself - */ - template - std::shared_ptr evalAsync(std::shared_ptr algorithm, - TArgs&&... params) - { - std::shared_ptr op{ new T(algorithm, - std::forward(params)...) }; - return this->evalAsync(op); - } - - /** - * Eval Await waits for the fence to finish processing and then once it - * finishes, it runs the postEval of all operations. - * - * @param waitFor Number of milliseconds to wait before timing out. - * @return shared_ptr of the Sequence class itself - */ - std::shared_ptr evalAwait(uint64_t waitFor = UINT64_MAX); - - /** - * Clear function clears all operations currently recorded and starts - * recording again. - */ - void clear(); - - /** - * Return the timestamps that were latched at the beginning and - * after each operation during the last eval() call. 
- */ - std::vector getTimestamps(); - - /** - * Begins recording commands for commands to be submitted into the command - * buffer. - */ - void begin(); - - /** - * Ends the recording and stops recording commands when the record command - * is sent. - */ - void end(); - - /** - * Returns true if the sequence is currently in recording activated. - * - * @return Boolean stating if recording ongoing. - */ - bool isRecording() const; - - /** - * Returns true if the sequence has been initialised, and it's based on the - * GPU resources being referenced. - * - * @return Boolean stating if is initialized - */ - bool isInit() const; - - /** - * Clears command buffer and triggers re-record of all the current - * operations saved, which is useful if the underlying kp::Tensors or - * kp::Algorithms are modified and need to be re-recorded. - */ - void rerecord(); - - /** - * Returns true if the sequence is currently running - mostly used for async - * workloads. - * - * @return Boolean stating if currently running. - */ - bool isRunning() const; - - /** - * Destroys and frees the GPU resources which include the buffer and memory - * and sets the sequence as init=False. - */ - void destroy(); - - private: - // -------------- NEVER OWNED RESOURCES - std::shared_ptr mPhysicalDevice = nullptr; - std::shared_ptr mDevice = nullptr; - std::shared_ptr mComputeQueue = nullptr; - uint32_t mQueueIndex = -1; - - // -------------- OPTIONALLY OWNED RESOURCES - std::shared_ptr mCommandPool = nullptr; - bool mFreeCommandPool = false; - std::shared_ptr mCommandBuffer = nullptr; - bool mFreeCommandBuffer = false; - - // -------------- ALWAYS OWNED RESOURCES - vk::Fence mFence; - std::vector> mOperations{}; - std::shared_ptr timestampQueryPool = nullptr; - - // State - bool mRecording = false; - bool mIsRunning = false; - - // Create functions - void createCommandPool(); - void createCommandBuffer(); - void createTimestampQueryPool(uint32_t totalTimestamps); -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/Tensor.hpp b/kompute/src/include/kompute/Tensor.hpp deleted file mode 100644 index 20939093d..000000000 --- a/kompute/src/include/kompute/Tensor.hpp +++ /dev/null @@ -1,302 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/Core.hpp" -#include "logger/Logger.hpp" -#include -#include - -namespace kp { - -/** - * Structured data used in GPU operations. - * - * Tensors are the base building block in Kompute to perform operations across - * GPUs. Each tensor would have a respective Vulkan memory and buffer, which - * would be used to store their respective data. The tensors can be used for GPU - * data storage or transfer. - */ -class Tensor -{ - public: - /** - * Type for tensors created: Device allows memory to be transferred from - * staging buffers. Staging are host memory visible. Storage are device - * visible but are not set up to transfer or receive data (only for shader - * storage). - */ - enum class TensorTypes - { - eDevice = 0, ///< Type is device memory, source and destination - eHost = 1, ///< Type is host memory, source and destination - eStorage = 2, ///< Type is Device memory (only) - }; - enum class TensorDataTypes - { - eBool = 0, - eInt = 1, - eUnsignedInt = 2, - eFloat = 3, - eDouble = 4, - }; - - static std::string toString(TensorDataTypes dt); - static std::string toString(TensorTypes dt); - - /** - * Constructor with data provided which would be used to create the - * respective vulkan buffer and memory. 
- * - * @param physicalDevice The physical device to use to fetch properties - * @param device The device to use to create the buffer and memory from - * @param data Non-zero-sized vector of data that will be used by the - * tensor - * @param tensorTypes Type for the tensor which is of type TensorTypes - */ - Tensor(std::shared_ptr physicalDevice, - std::shared_ptr device, - void* data, - uint32_t elementTotalCount, - uint32_t memorySize, - const TensorDataTypes& dataType, - vk::DeviceMemory *primaryMemory, - vk::Buffer *primaryBuffer, - vk::DeviceMemory *stagingMemory, - vk::Buffer *stagingBuffer, - vk::DeviceSize offset, - const TensorTypes& tensorType = TensorTypes::eDevice); - - /** - * Destructor which is in charge of freeing vulkan resources unless they - * have been provided externally. - */ - virtual ~Tensor(); - - /** - * Function to trigger reinitialisation of the tensor buffer and memory with - * new data as well as new potential device type. - * - * @param data Vector of data to use to initialise vector from - * @param tensorType The type to use for the tensor - */ - void rebuild(void* data, - uint32_t elementTotalCount, - uint64_t memorySize, - vk::DeviceMemory *primaryMemory, - vk::Buffer *primaryBuffer, - vk::DeviceMemory *stagingMemory, - vk::Buffer *stagingBuffer, - vk::DeviceSize offset); - - /** - * Destroys and frees the GPU resources which include the buffer and memory. - */ - void destroy(); - - /** - * Check whether tensor is initialized based on the created gpu resources. - * - * @returns Boolean stating whether tensor is initialized - */ - bool isInit(); - - /** - * Retrieve the tensor type of the Tensor - * - * @return Tensor type of tensor - */ - TensorTypes tensorType(); - - /** - * Records a copy from the memory of the tensor provided to the current - * thensor. This is intended to pass memory into a processing, to perform - * a staging buffer transfer, or to gather output (between others). - * - * @param commandBuffer Vulkan Command Buffer to record the commands into - * @param copyFromTensor Tensor to copy the data from - */ - void recordCopyFrom(const vk::CommandBuffer& commandBuffer, - std::shared_ptr copyFromTensor); - - void recordFill(const vk::CommandBuffer &commandBuffer, - uint32_t fill); - - /** - * Records a copy from the internal staging memory to the device memory - * using an optional barrier to wait for the operation. This function would - * only be relevant for kp::Tensors of type eDevice. - * - * @param commandBuffer Vulkan Command Buffer to record the commands into - */ - void recordCopyFromStagingToDevice(const vk::CommandBuffer& commandBuffer); - - /** - * Records a copy from the internal device memory to the staging memory - * using an optional barrier to wait for the operation. This function would - * only be relevant for kp::Tensors of type eDevice. - * - * @param commandBuffer Vulkan Command Buffer to record the commands into - */ - void recordCopyFromDeviceToStaging(const vk::CommandBuffer& commandBuffer); - - /** - * Records the buffer memory barrier into the primary buffer and command - * buffer which ensures that relevant data transfers are carried out - * correctly. 
- * - * @param commandBuffer Vulkan Command Buffer to record the commands into - * @param srcAccessMask Access flags for source access mask - * @param dstAccessMask Access flags for destination access mask - * @param scrStageMask Pipeline stage flags for source stage mask - * @param dstStageMask Pipeline stage flags for destination stage mask - */ - void recordPrimaryBufferMemoryBarrier( - const vk::CommandBuffer& commandBuffer, - vk::AccessFlagBits srcAccessMask, - vk::AccessFlagBits dstAccessMask, - vk::PipelineStageFlagBits srcStageMask, - vk::PipelineStageFlagBits dstStageMask); - /** - * Records the buffer memory barrier into the staging buffer and command - * buffer which ensures that relevant data transfers are carried out - * correctly. - * - * @param commandBuffer Vulkan Command Buffer to record the commands into - * @param srcAccessMask Access flags for source access mask - * @param dstAccessMask Access flags for destination access mask - * @param scrStageMask Pipeline stage flags for source stage mask - * @param dstStageMask Pipeline stage flags for destination stage mask - */ - void recordStagingBufferMemoryBarrier( - const vk::CommandBuffer& commandBuffer, - vk::AccessFlagBits srcAccessMask, - vk::AccessFlagBits dstAccessMask, - vk::PipelineStageFlagBits srcStageMask, - vk::PipelineStageFlagBits dstStageMask); - - /** - * Constructs a vulkan descriptor buffer info which can be used to specify - * and reference the underlying buffer component of the tensor without - * exposing it. - * - * @return Descriptor buffer info with own buffer - */ - vk::DescriptorBufferInfo constructDescriptorBufferInfo(); - - /** - * Returns the size/magnitude of the Tensor, which will be the total number - * of elements across all dimensions - * - * @return Unsigned integer representing the total number of elements - */ - uint32_t size(); - - /** - * Returns the total memory size of the data contained by the Tensor object - * - * @return Unsigned integer representing the memory of the tensor in bytes. - */ - uint64_t memorySize(); - - /** - * Retrieve the data type of the tensor (host, device, storage) - * - * @return Data type of tensor of type kp::Tensor::TensorDataTypes - */ - TensorDataTypes dataType(); - - /** - * Retrieve the raw data via the pointer to the memory that contains the raw - * memory of this current tensor. This tensor gets changed to a nullptr when - * the Tensor is removed. - * - * @return Pointer to raw memory containing raw bytes data of Tensor. - */ - void* rawData(); - - /** - * Sets / resets the data of the tensor which is directly done on the GPU - * host visible memory available by the tensor. - */ - void setRawData(const void* data); - - /** - * Template to return the pointer data converted by specific type, which - * would be any of the supported types including float, double, int32, - * uint32 and bool. - * - * @return Pointer to raw memory containing raw bytes data of Tensor. - */ - template - T* data() - { - return (T*)this->mRawData; - } - - /** - * Template to get the data of the current tensor as a vector of specific - * type, which would be any of the supported types including float, double, - * int32, uint32 and bool. - * - * @return Vector of type provided by template. 
- */ - template - std::vector vector() - { - return { (T*)this->mRawData, ((T*)this->mRawData) + this->size() }; - } - - protected: - // -------------- ALWAYS OWNED RESOURCES - TensorTypes mTensorType; - TensorDataTypes mDataType; - uint32_t mSize = 0; - uint64_t mMemorySize = 0; - vk::DeviceSize mOffset = 0; - void* mRawData = nullptr; - - private: - // -------------- NEVER OWNED RESOURCES - std::shared_ptr mPhysicalDevice; - std::shared_ptr mDevice; - vk::Buffer *mPrimaryBuffer = nullptr; - vk::Buffer *mStagingBuffer = nullptr; - vk::DeviceMemory *mPrimaryMemory = nullptr; - vk::DeviceMemory *mStagingMemory = nullptr; - - void setGPUResources(vk::DeviceMemory *primaryMemory, - vk::Buffer *primaryBuffer, - vk::DeviceMemory *stagingMemory, - vk::Buffer *stagingBuffer, - vk::DeviceSize offset); - void recordCopyBuffer(const vk::CommandBuffer& commandBuffer, - vk::Buffer *bufferFrom, - vk::Buffer *bufferTo, - vk::DeviceSize bufferSize, - vk::BufferCopy copyRegion); - - void recordBufferMemoryBarrier(const vk::CommandBuffer& commandBuffer, - const vk::Buffer& buffer, - vk::AccessFlagBits srcAccessMask, - vk::AccessFlagBits dstAccessMask, - vk::PipelineStageFlagBits srcStageMask, - vk::PipelineStageFlagBits dstStageMask); - - // Private util functions - vk::BufferUsageFlags getPrimaryBufferUsageFlags(); - vk::MemoryPropertyFlags getPrimaryMemoryPropertyFlags(); - vk::BufferUsageFlags getStagingBufferUsageFlags(); - vk::MemoryPropertyFlags getStagingMemoryPropertyFlags(); -}; - -template -class TensorT : public Tensor -{ - - public: - ~TensorT() { KP_LOG_DEBUG("Kompute TensorT destructor"); } - - TensorDataTypes dataType(); -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/logger/Logger.hpp b/kompute/src/include/kompute/logger/Logger.hpp deleted file mode 100644 index f97e95cf0..000000000 --- a/kompute/src/include/kompute/logger/Logger.hpp +++ /dev/null @@ -1,197 +0,0 @@ -#pragma once - -#define KOMPUTE_LOG_LEVEL_TRACE 0 -#define KOMPUTE_LOG_LEVEL_DEBUG 1 -#define KOMPUTE_LOG_LEVEL_INFO 2 -#define KOMPUTE_LOG_LEVEL_WARN 3 -#define KOMPUTE_LOG_LEVEL_ERROR 4 -#define KOMPUTE_LOG_LEVEL_CRITICAL 5 -#define KOMPUTE_LOG_LEVEL_OFF 6 - -// Logging is disabled entirely. -#if KOMPUTE_OPT_LOG_LEVEL_DISABLED -#define KP_LOG_TRACE(...) -#define KP_LOG_DEBUG(...) -#define KP_LOG_INFO(...) -#define KP_LOG_WARN(...) -#define KP_LOG_ERROR(...) -#else - -#if !KOMPUTE_OPT_USE_SPDLOG -#if VK_USE_PLATFORM_ANDROID_KHR -#include -#include -static const char* KOMPUTE_LOG_TAG = "KomputeLog"; -#else -#if KOMPUTE_BUILD_PYTHON -#include -namespace py = pybind11; -// from python/src/main.cpp -extern py::object kp_trace, kp_debug, kp_info, kp_warning, kp_error; -#else -#include -#endif // KOMPUTE_BUILD_PYTHON -#endif // VK_USE_PLATFORM_ANDROID_KHR -#else -#include -#endif // !KOMPUTE_OPT_USE_SPDLOG -#include -#include -#include -namespace logger { -// Setup the logger, note the loglevel can not be set below the CMake log level -// (To change this use -DKOMPUTE_OPT_LOG_LEVEL=...) -void -setupLogger(); - -// Logging is enabled, but we do not use Spdlog. So we use fmt in case nothing -// else is defined, overriding logging. -#if !KOMPUTE_OPT_USE_SPDLOG - -#ifndef KP_LOG_TRACE -#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_TRACE -#if VK_USE_PLATFORM_ANDROID_KHR -#define KP_LOG_TRACE(...) \ - ((void)__android_log_write( \ - ANDROID_LOG_VERBOSE, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str())) -#else -#if KOMPUTE_BUILD_PYTHON -#define KP_LOG_DEBUG(...) 
kp_trace(fmt::format(__VA_ARGS__)) -#else -#define KP_LOG_TRACE(...) \ - fmt::print("[{} {}] [trace] [{}:{}] {}\n", \ - __DATE__, \ - __TIME__, \ - __FILE__, \ - __LINE__, \ - fmt::format(__VA_ARGS__)) -#endif // KOMPUTE_BUILD_PYTHON -#endif // VK_USE_PLATFORM_ANDROID_KHR -#else -#define KP_LOG_TRACE(...) -#endif -#endif // !KP_LOG_TRACE - -#ifndef KP_LOG_DEBUG -#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_DEBUG -#if VK_USE_PLATFORM_ANDROID_KHR -#define KP_LOG_DEBUG(...) \ - ((void)__android_log_write( \ - ANDROID_LOG_DEBUG, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str())) -#else -#if KOMPUTE_BUILD_PYTHON -#define KP_LOG_DEBUG(...) kp_debug(fmt::format(__VA_ARGS__)) -#else -#ifdef __FILE_NAME__ // gcc 12 provides only file name without path -#define KP_LOG_DEBUG(...) \ - fmt::print("[{} {}] [debug] [{}:{}] {}\n", \ - __DATE__, \ - __TIME__, \ - __FILE_NAME__, \ - __LINE__, \ - fmt::format(__VA_ARGS__)) -#else -#define KP_LOG_DEBUG(...) \ - fmt::print("[{} {}] [debug] [{}:{}] {}\n", \ - __DATE__, \ - __TIME__, \ - __FILE__, \ - __LINE__, \ - fmt::format(__VA_ARGS__)) -#endif // __FILE__NAME__ -#endif // KOMPUTE_BUILD_PYTHON -#endif // VK_USE_PLATFORM_ANDROID_KHR -#else -#define KP_LOG_DEBUG(...) -#endif -#endif // !KP_LOG_DEBUG - -#ifndef KP_LOG_INFO -#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_INFO -#if VK_USE_PLATFORM_ANDROID_KHR -#define KP_LOG_INFO(...) \ - ((void)__android_log_write( \ - ANDROID_LOG_INFO, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str())) -#else -#if KOMPUTE_BUILD_PYTHON -#define KP_LOG_DEBUG(...) kp_info(fmt::format(__VA_ARGS__)) -#else -#define KP_LOG_INFO(...) \ - fmt::print("[{} {}] [info] [{}:{}] {}\n", \ - __DATE__, \ - __TIME__, \ - __FILE__, \ - __LINE__, \ - fmt::format(__VA_ARGS__)) -#endif // KOMPUTE_BUILD_PYTHON -#endif // VK_USE_PLATFORM_ANDROID_KHR -#else -#define KP_LOG_INFO(...) -#endif -#endif // !KP_LOG_INFO - -#ifndef KP_LOG_WARN -#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_WARN -#if VK_USE_PLATFORM_ANDROID_KHR -#define KP_LOG_WARN(...) \ - ((void)__android_log_write( \ - ANDROID_LOG_WARN, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str())) -#else -#if KOMPUTE_BUILD_PYTHON -#define KP_LOG_DEBUG(...) kp_warning(fmt::format(__VA_ARGS__)) -#else -#define KP_LOG_WARN(...) \ - fmt::print("[{} {}] [warn] [{}:{}] {}\n", \ - __DATE__, \ - __TIME__, \ - __FILE__, \ - __LINE__, \ - fmt::format(__VA_ARGS__)) -#endif // KOMPUTE_BUILD_PYTHON -#endif // VK_USE_PLATFORM_ANDROID_KHR -#else -#define KP_LOG_WARN(...) -#endif -#endif // !KP_LOG_WARN - -#ifndef KP_LOG_ERROR -#if KOMPUTE_OPT_ACTIVE_LOG_LEVEL <= KOMPUTE_LOG_LEVEL_ERROR -#if VK_USE_PLATFORM_ANDROID_KHR -#define KP_LOG_ERROR(...) \ - ((void)__android_log_write( \ - ANDROID_LOG_ERROR, KOMPUTE_LOG_TAG, fmt::format(__VA_ARGS__).c_str())) -#else -#if KOMPUTE_BUILD_PYTHON -#define KP_LOG_DEBUG(...) kp_error(fmt::format(__VA_ARGS__)) -#else -#define KP_LOG_ERROR(...) \ - fmt::print("[{} {}] [error] [{}:{}] {}\n", \ - __DATE__, \ - __TIME__, \ - __FILE__, \ - __LINE__, \ - fmt::format(__VA_ARGS__)) -#endif // KOMPUTE_BUILD_PYTHON -#endif // VK_USE_PLATFORM_ANDROID_KHR -#else -#define KP_LOG_ERROR(...) -#endif -#endif // !KP_LOG_ERROR -#else - -#define KP_LOG_TRACE(...) SPDLOG_TRACE(__VA_ARGS__) -#define KP_LOG_DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__) -#define KP_LOG_INFO(...) SPDLOG_INFO(__VA_ARGS__) -#define KP_LOG_WARN(...) SPDLOG_WARN(__VA_ARGS__) -#define KP_LOG_ERROR(...) 
SPDLOG_ERROR(__VA_ARGS__) - -void -setLogLevel(spdlog::level::level_enum level); - -spdlog::level::level_enum -getLogLevel(); - -#endif // !KOMPUTE_OPT_USE_SPDLOG -} // namespace logger - -#endif // KOMPUTE_OPT_LOG_LEVEL_DISABLED diff --git a/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp b/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp deleted file mode 100644 index e91598f05..000000000 --- a/kompute/src/include/kompute/operations/OpAlgoDispatch.hpp +++ /dev/null @@ -1,86 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/Algorithm.hpp" -#include "kompute/Core.hpp" -#include "kompute/Tensor.hpp" -#include "kompute/operations/OpBase.hpp" - -namespace kp { - -/** - * Operation that provides a general abstraction that simplifies the use of - * algorithm and parameter components which can be used with shaders. - * By default it enables the user to provide a dynamic number of tensors - * which are then passed as inputs. - */ -class OpAlgoDispatch : public OpBase -{ - public: - /** - * Constructor that stores the algorithm to use as well as the relevant - * push constants to override when recording. - * - * @param algorithm The algorithm object to use for dispatch - * @param pushConstants The push constants to use for override - */ - template - OpAlgoDispatch(const std::shared_ptr& algorithm, - const std::vector& pushConstants = {}) - { - KP_LOG_DEBUG("Kompute OpAlgoDispatch constructor"); - - this->mAlgorithm = algorithm; - - if (pushConstants.size()) { - uint32_t memorySize = sizeof(decltype(pushConstants.back())); - uint32_t size = pushConstants.size(); - uint32_t totalSize = size * memorySize; - this->mPushConstantsData = malloc(totalSize); - memcpy(this->mPushConstantsData, pushConstants.data(), totalSize); - this->mPushConstantsDataTypeMemorySize = memorySize; - this->mPushConstantsSize = size; - } - } - - /** - * Default destructor, which is in charge of destroying the algorithm - * components but does not destroy the underlying tensors - */ - virtual ~OpAlgoDispatch() override; - - /** - * This records the commands that are to be sent to the GPU. This includes - * the barriers that ensure the memory has been copied before going in and - * out of the shader, as well as the dispatch operation that sends the - * shader processing to the gpu. This function also records the GPU memory - * copy of the output data for the staging buffer so it can be read by the - * host. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void record(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any preEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void preEval(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any postEval commands. - * - * @param commandBuffer The command buffer to record the command into. 
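A hedged sketch of a typical OpAlgoDispatch round trip with push constants, based on the upstream Kompute Manager and Sequence helpers that this vendored copy derives from; the Manager in this tree is modified, so the algorithm() and sequence() signatures shown here are assumptions, not code from the patch.

#include "kompute/Kompute.hpp"

// Assumes `spirv` is a compiled compute shader and `params` the tensors it binds.
void dispatchWithPushConstants(kp::Manager& mgr,
                               const std::vector<std::shared_ptr<kp::Tensor>>& params,
                               const std::vector<uint32_t>& spirv)
{
    std::shared_ptr<kp::Algorithm> algo = mgr.algorithm(params, spirv);

    mgr.sequence()
       ->record<kp::OpTensorSyncDevice>(params)                        // staging -> device
       ->record<kp::OpAlgoDispatch>(algo, std::vector<float>{ 0.5f })  // push constant override
       ->record<kp::OpTensorSyncLocal>(params)                         // device -> staging
       ->eval();
}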
- */ - virtual void postEval(const vk::CommandBuffer& commandBuffer) override; - - private: - // -------------- ALWAYS OWNED RESOURCES - std::shared_ptr mAlgorithm; - void* mPushConstantsData = nullptr; - uint32_t mPushConstantsDataTypeMemorySize = 0; - uint32_t mPushConstantsSize = 0; -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpBase.hpp b/kompute/src/include/kompute/operations/OpBase.hpp deleted file mode 100644 index 737670846..000000000 --- a/kompute/src/include/kompute/operations/OpBase.hpp +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/Algorithm.hpp" -#include "kompute/Core.hpp" -#include "kompute/Tensor.hpp" - -namespace kp { - -/** - * Base Operation which provides the high level interface that Kompute - * operations implement in order to perform a set of actions in the GPU. - * - * Operations can perform actions on tensors, and optionally can also own an - * Algorithm with respective parameters. kp::Operations with kp::Algorithms - * would inherit from kp::OpBaseAlgo. - */ -class OpBase -{ - public: - /** - * Default destructor for OpBase class. This OpBase destructor class should - * always be called to destroy and free owned resources unless it is - * intended to destroy the resources in the parent class. - */ - virtual ~OpBase() { KP_LOG_DEBUG("Kompute OpBase destructor started"); } - - /** - * The record function is intended to only send a record command or run - * commands that are expected to record operations that are to be submitted - * as a batch into the GPU. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void record(const vk::CommandBuffer& commandBuffer) = 0; - - /** - * Pre eval is called before the Sequence has called eval and submitted the - * commands to the GPU for processing, and can be used to perform any - * per-eval setup steps required as the computation iteration begins. It's - * worth noting that there are situations where eval can be called multiple - * times, so the resources that are created should be idempotent in case - * it's called multiple times in a row. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void preEval(const vk::CommandBuffer& commandBuffer) = 0; - - /** - * Post eval is called after the Sequence has called eval and submitted the - * commands to the GPU for processing, and can be used to perform any - * tear-down steps required as the computation iteration finishes. It's - * worth noting that there are situations where eval can be called multiple - * times, so the resources that are destroyed should not require a re-init - * unless explicitly provided by the user. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void postEval(const vk::CommandBuffer& commandBuffer) = 0; -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp b/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp deleted file mode 100644 index 50d8e9707..000000000 --- a/kompute/src/include/kompute/operations/OpBufferSyncDevice.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/operations/OpBase.hpp" - -namespace kp { - -class OpBufferSyncDevice : public OpBase -{ - public: - OpBufferSyncDevice( - vk::Buffer *primaryBuffer, - vk::Buffer *stagingBuffer, - vk::DeviceSize size); - - /** - * Default destructor. 
This class does not manage memory so it won't be - * expecting the parent to perform a release. - */ - ~OpBufferSyncDevice() override; - - /** - * For device buffers, it records the copy command for the buffer to copy - * the data from its staging to device memory. - * - * @param commandBuffer The command buffer to record the command into. - */ - void record(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any preEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void preEval(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any postEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void postEval(const vk::CommandBuffer& commandBuffer) override; - - private: - vk::Buffer *mPrimaryBuffer; - vk::Buffer *mStagingBuffer; - vk::DeviceSize mSize; -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp b/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp deleted file mode 100644 index 7db997199..000000000 --- a/kompute/src/include/kompute/operations/OpBufferSyncLocal.hpp +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/operations/OpBase.hpp" - -namespace kp { - -class OpBufferSyncLocal : public OpBase -{ - public: - OpBufferSyncLocal( - vk::Buffer *primaryBuffer, - vk::Buffer *stagingBuffer, - vk::DeviceSize size); - - /** - * Default destructor. This class does not manage memory so it won't be - * expecting the parent to perform a release. - */ - ~OpBufferSyncLocal() override; - - /** - * For device buffers, it records the copy command for the buffer to copy - * the data from its staging to device memory. - * - * @param commandBuffer The command buffer to record the command into. - */ - void record(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any preEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void preEval(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any postEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void postEval(const vk::CommandBuffer& commandBuffer) override; - - private: - vk::Buffer *mPrimaryBuffer; - vk::Buffer *mStagingBuffer; - vk::DeviceSize mSize; -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp b/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp deleted file mode 100644 index 4a2322323..000000000 --- a/kompute/src/include/kompute/operations/OpMemoryBarrier.hpp +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/Algorithm.hpp" -#include "kompute/Core.hpp" -#include "kompute/Tensor.hpp" -#include "kompute/operations/OpBase.hpp" - -namespace kp { - -/** - * Operation that provides a general abstraction that simplifies the use of - * algorithm and parameter components which can be used with shaders. - * It exposes the pipeline barrier functionality specifically for memory - * barriers that can be configured through the respective source and destination - * masks - */ -class OpMemoryBarrier : public OpBase -{ - public: - /** - * Constructor that stores tensors as well as memory barrier parameters to - * be used to create a pipeline barrier on the respective primary or staging - * tensor. 
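To make the barrier parameters concrete, a sketch of two chained dispatches separated by an OpMemoryBarrier so the first dispatch's writes are visible to the second. The access and stage flags are typical compute-to-compute choices rather than values taken from this series, and the Sequence::record helper is assumed from upstream Kompute; illustrative only.

#include "kompute/Kompute.hpp"

void chainDispatches(std::shared_ptr<kp::Sequence> seq,
                     std::shared_ptr<kp::Tensor> produced,
                     std::shared_ptr<kp::Algorithm> producer,
                     std::shared_ptr<kp::Algorithm> consumer)
{
    seq->record<kp::OpAlgoDispatch>(producer)
       ->record<kp::OpMemoryBarrier>({ produced },
                                     vk::AccessFlagBits::eShaderWrite,          // srcAccessMask
                                     vk::AccessFlagBits::eShaderRead,           // dstAccessMask
                                     vk::PipelineStageFlagBits::eComputeShader, // srcStageMask
                                     vk::PipelineStageFlagBits::eComputeShader, // dstStageMask
                                     true)                                      // barrier on the primary (device) buffer
       ->record<kp::OpAlgoDispatch>(consumer)
       ->eval();
}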
- * - * @param tensors The tensors to apply the memory barriers on - * @param srcAccessMask The kp::AccessFlagBits for the source access mask - * @param dstAccessMask The kp::AccessFlagBits for the destination access - * mask - * @param srcStageMask The kp::PipelineStageFlagBits for the source stage - * mask - * @param dstStageMask The kp::PipelineStageFlagBits for the destination - * stage mask - * @param barrierOnPrimary Boolean to select primary or secondary buffers on - * tensors - */ - OpMemoryBarrier(const std::vector>& tensors, - const vk::AccessFlagBits& srcAccessMask, - const vk::AccessFlagBits& dstAccessMask, - const vk::PipelineStageFlagBits& srcStageMask, - const vk::PipelineStageFlagBits& dstStageMask, - bool barrierOnPrimary = true); - - /** - * Default destructor, which is in charge of destroying the reference to the - * tensors and all the relevant access / stage masks created - */ - virtual ~OpMemoryBarrier() override; - - /** - * This records the memory barrier with the access and stage masks provided - * across all relevant tensors. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void record(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any preEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void preEval(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any postEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void postEval(const vk::CommandBuffer& commandBuffer) override; - - private: - const vk::AccessFlagBits mSrcAccessMask; - const vk::AccessFlagBits mDstAccessMask; - const vk::PipelineStageFlagBits mSrcStageMask; - const vk::PipelineStageFlagBits mDstStageMask; - const bool mBarrierOnPrimary; - const std::vector> mTensors; -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpMult.hpp b/kompute/src/include/kompute/operations/OpMult.hpp deleted file mode 100644 index f75ccc4fb..000000000 --- a/kompute/src/include/kompute/operations/OpMult.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include - -#include "kompute/Core.hpp" - -#include "ShaderOpMult.hpp" - -#include "kompute/Algorithm.hpp" -#include "kompute/Tensor.hpp" - -#include "kompute/operations/OpAlgoDispatch.hpp" - -namespace kp { - -/** - * Operation that performs multiplication on two tensors and outpus on third - * tensor. - */ -class OpMult : public OpAlgoDispatch -{ - public: - /** - * Default constructor with parameters that provides the bare minimum - * requirements for the operations to be able to create and manage their - * sub-components. 
- * - * @param tensors Tensors that are to be used in this operation - * @param algorithm An algorithm that will be overridden with the OpMult - * shader data and the tensors provided which are expected to be 3 - */ - OpMult(std::vector> tensors, - std::shared_ptr algorithm) - : OpAlgoDispatch(algorithm) - { - KP_LOG_DEBUG("Kompute OpMult constructor with params"); - - if (tensors.size() != 3) { - throw std::runtime_error( - "Kompute OpMult expected 3 tensors but got " + - std::to_string(tensors.size())); - } - - const std::vector spirv = std::vector( - SHADEROPMULT_COMP_SPV.begin(), SHADEROPMULT_COMP_SPV.end()); - - algorithm->rebuild<>(tensors, spirv); - } - - /** - * Default destructor, which is in charge of destroying the algorithm - * components but does not destroy the underlying tensors - */ - ~OpMult() override { KP_LOG_DEBUG("Kompute OpMult destructor started"); } -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpTensorCopy.hpp b/kompute/src/include/kompute/operations/OpTensorCopy.hpp deleted file mode 100644 index 968c1065a..000000000 --- a/kompute/src/include/kompute/operations/OpTensorCopy.hpp +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/Core.hpp" - -#include "kompute/Tensor.hpp" - -#include "kompute/operations/OpBase.hpp" - -namespace kp { - -/** - * Operation that copies the data from the first tensor to the rest of the - * tensors provided, using a record command for all the vectors. This operation - * does not own/manage the memory of the tensors passed to it. The operation - * must only receive tensors of type - */ -class OpTensorCopy : public OpBase -{ - public: - /** - * Default constructor with parameters that provides the core vulkan - * resources and the tensors that will be used in the operation. - * - * @param tensors Tensors that will be used to create in operation. - */ - OpTensorCopy(const std::vector>& tensors); - - /** - * Default destructor. This class does not manage memory so it won't be - * expecting the parent to perform a release. - */ - ~OpTensorCopy() override; - - /** - * Records the copy commands from the first tensor into all the other - * tensors provided. Also optionally records a barrier. - * - * @param commandBuffer The command buffer to record the command into. - */ - void record(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any preEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void preEval(const vk::CommandBuffer& commandBuffer) override; - - /** - * Copies the local vectors for all the tensors to sync the data with the - * gpu. - * - * @param commandBuffer The command buffer to record the command into. 
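A small usage sketch for OpTensorCopy, assuming the upstream Kompute Manager/Sequence helpers; after eval() the postEval step described above has also refreshed the host-side copies. Illustrative only, not code from this series.

#include "kompute/Kompute.hpp"

// Copies `src` into `dstA` and `dstB` on the GPU, then syncs host-side data.
void broadcast(kp::Manager& mgr,
               std::shared_ptr<kp::Tensor> src,
               std::shared_ptr<kp::Tensor> dstA,
               std::shared_ptr<kp::Tensor> dstB)
{
    mgr.sequence()
       ->record<kp::OpTensorCopy>({ src, dstA, dstB }) // first tensor is the source
       ->eval();
}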
- */ - virtual void postEval(const vk::CommandBuffer& commandBuffer) override; - - private: - // -------------- ALWAYS OWNED RESOURCES - std::vector> mTensors; -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpTensorFill.hpp b/kompute/src/include/kompute/operations/OpTensorFill.hpp deleted file mode 100644 index 9a6bf131e..000000000 --- a/kompute/src/include/kompute/operations/OpTensorFill.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/Core.hpp" - -#include "kompute/Tensor.hpp" - -#include "kompute/operations/OpBase.hpp" - -namespace kp { - -/** - * Operation that fills the tensor - */ -class OpTensorFill : public OpBase -{ - public: - /** - * Default constructor with parameters that provides the core vulkan - * resources and the tensors that will be used in the operation. - * - * @param tensors Tensors that will be used to create in operation. - */ - OpTensorFill(const std::vector>& tensors); - - /** - * Default destructor. This class does not manage memory so it won't be - * expecting the parent to perform a release. - */ - ~OpTensorFill() override; - - /** - * Records the fill command for tensor. - * - * @param commandBuffer The command buffer to record the command into. - */ - void record(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any preEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void preEval(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any postEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void postEval(const vk::CommandBuffer& commandBuffer) override; - - private: - // -------------- ALWAYS OWNED RESOURCES - std::vector> mTensors; -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp b/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp deleted file mode 100644 index 9b39e490f..000000000 --- a/kompute/src/include/kompute/operations/OpTensorSyncDevice.hpp +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/Core.hpp" -#include "kompute/Tensor.hpp" -#include "kompute/operations/OpBase.hpp" - -namespace kp { - -/** - * Operation that syncs tensor's device by mapping local data into the device - * memory. For TensorTypes::eDevice it will use a record operation for the - * memory to be syncd into GPU memory which means that the operation will be - * done in sync with GPU commands. For TensorTypes::eHost it will only map the - * data into host memory which will happen during preEval before the recorded - * commands are dispatched. - */ -class OpTensorSyncDevice : public OpBase -{ - public: - /** - * Default constructor with parameters that provides the core vulkan - * resources and the tensors that will be used in the operation. The tensos - * provided cannot be of type TensorTypes::eStorage. - * - * @param tensors Tensors that will be used to create in operation. - */ - OpTensorSyncDevice(const std::vector>& tensors); - - /** - * Default destructor. This class does not manage memory so it won't be - * expecting the parent to perform a release. - */ - ~OpTensorSyncDevice() override; - - /** - * For device tensors, it records the copy command for the tensor to copy - * the data from its staging to device memory. - * - * @param commandBuffer The command buffer to record the command into. 
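A sketch of the usual host/device round trip built from the two sync operations: for TensorTypes::eDevice this records the staging/device copies described above, while for eHost only the mapping in preEval/postEval applies. setRawData() comes from the Tensor header earlier in this series; the sequence helpers and the record() overload taking an initializer list are assumed from upstream Kompute. Illustrative only.

#include "kompute/Kompute.hpp"

// Assumes `t` already holds exactly three float elements.
void roundTrip(kp::Manager& mgr, std::shared_ptr<kp::Tensor> t)
{
    const std::vector<float> fresh = { 3.0f, 1.0f, 4.0f };
    t->setRawData(fresh.data());               // update the host-visible copy

    mgr.sequence()
       ->record<kp::OpTensorSyncDevice>({ t }) // staging -> device
       ->record<kp::OpTensorSyncLocal>({ t })  // device -> staging
       ->eval();
}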
- */ - void record(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any preEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void preEval(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any postEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void postEval(const vk::CommandBuffer& commandBuffer) override; - - private: - // -------------- ALWAYS OWNED RESOURCES - std::vector> mTensors; - vk::Buffer *mPrimaryBuffer; - vk::Buffer *mStagingBuffer; - vk::DeviceSize mSize; -}; - -} // End namespace kp diff --git a/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp b/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp deleted file mode 100644 index 4216003e5..000000000 --- a/kompute/src/include/kompute/operations/OpTensorSyncLocal.hpp +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "kompute/Core.hpp" - -#include "kompute/Tensor.hpp" - -#include "kompute/operations/OpBase.hpp" - -namespace kp { - -/** - * Operation that syncs tensor's local memory by mapping device data into the - * local CPU memory. For TensorTypes::eDevice it will use a record operation - * for the memory to be syncd into GPU memory which means that the operation - * will be done in sync with GPU commands. For TensorTypes::eHost it will - * only map the data into host memory which will happen during preEval before - * the recorded commands are dispatched. - */ -class OpTensorSyncLocal : public OpBase -{ - public: - /** - * Default constructor with parameters that provides the core vulkan - * resources and the tensors that will be used in the operation. The tensors - * provided cannot be of type TensorTypes::eStorage. - * - * @param tensors Tensors that will be used to create in operation. - */ - OpTensorSyncLocal(const std::vector>& tensors); - - /** - * Default destructor. This class does not manage memory so it won't be - * expecting the parent to perform a release. - */ - ~OpTensorSyncLocal() override; - - /** - * For device tensors, it records the copy command for the tensor to copy - * the data from its device to staging memory. - * - * @param commandBuffer The command buffer to record the command into. - */ - void record(const vk::CommandBuffer& commandBuffer) override; - - /** - * Does not perform any preEval commands. - * - * @param commandBuffer The command buffer to record the command into. - */ - virtual void preEval(const vk::CommandBuffer& commandBuffer) override; - - /** - * For host tensors it performs the map command from the host memory into - * local memory. - * - * @param commandBuffer The command buffer to record the command into. 
- */ - virtual void postEval(const vk::CommandBuffer& commandBuffer) override; - - private: - // -------------- ALWAYS OWNED RESOURCES - std::vector> mTensors; -}; - -} // End namespace kp diff --git a/kompute/src/logger/CMakeLists.txt b/kompute/src/logger/CMakeLists.txt deleted file mode 100644 index 1f8695acd..000000000 --- a/kompute/src/logger/CMakeLists.txt +++ /dev/null @@ -1,69 +0,0 @@ -cmake_minimum_required(VERSION 3.20) - -set(LOGGER_SOURCES Logger.cpp) - -add_library(kp_logger STATIC ${LOGGER_SOURCES}) - -# Define log levels in code -add_compile_definitions(KOMPUTE_LOG_LEVEL_TRACE=0) -add_compile_definitions(KOMPUTE_LOG_LEVEL_DEBUG=1) -add_compile_definitions(KOMPUTE_LOG_LEVEL_INFO=2) -add_compile_definitions(KOMPUTE_LOG_LEVEL_WARN=3) -add_compile_definitions(KOMPUTE_LOG_LEVEL_ERROR=4) -add_compile_definitions(KOMPUTE_LOG_LEVEL_CRITICAL=5) -add_compile_definitions(KOMPUTE_LOG_LEVEL_OFF=6) - -if(KOMPUTE_OPT_BUILD_PYTHON AND KOMPUTE_OPT_USE_SPDLOG) - message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_PYTHON' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.") -endif() - -if(KOMPUTE_OPT_ANDROID_BUILD AND KOMPUTE_OPT_USE_SPDLOG) - message(FATAL_ERROR "'KOMPUTE_OPT_ANDROID_BUILD' is incompatible with 'KOMPUTE_OPT_USE_SPDLOG'. To continue set either one option to 'OFF'.") -endif() - -if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Trace") - set(KOMPUTE_OPT_LOG_LEVEL TRACE) - message(STATUS "Using log level Trace") -elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Debug") - set(KOMPUTE_OPT_LOG_LEVEL DEBUG) - message(STATUS "Using log level Debug") -elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Info") - set(KOMPUTE_OPT_LOG_LEVEL INFO) - message(STATUS "Using log level Info") -elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Warn") - set(KOMPUTE_OPT_LOG_LEVEL WARN) - message(STATUS "Using log level Warn") -elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Error") - set(KOMPUTE_OPT_LOG_LEVEL ERROR) - message(STATUS "Using log level Error") -elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Critical") - set(KOMPUTE_OPT_LOG_LEVEL CRITICAL) - message(STATUS "Using log level Critical") -elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Off") - set(KOMPUTE_OPT_LOG_LEVEL OFF) - message(STATUS "Using log level Off") -elseif(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Default") - set(KOMPUTE_OPT_LOG_LEVEL $,DEBUG,INFO>) - message(STATUS "Setting KOMPUTE_OPT_LOG_LEVEL to according to the build type") -else() - message(FATAL_ERROR "Log level '${KOMPUTE_OPT_LOG_LEVEL}' unknown, use -DKOMPUTE_OPT_LOG_LEVEL={Trace, Debug, Info, Warn, Error, Critical, Off, Default} to set it to a correct value.") -endif() - -# Always make sure we define the Kompute log level independent of the Spdlog log level -target_compile_definitions(kp_logger INTERFACE KOMPUTE_OPT_ACTIVE_LOG_LEVEL=KOMPUTE_LOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}) - -# Link depending on how the logger should be setup -if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED) - if(KOMPUTE_OPT_USE_SPDLOG) - target_link_libraries(kp_logger PUBLIC spdlog::spdlog) - target_compile_definitions(spdlog INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}) - target_compile_definitions(kp_logger INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}) - message(STATUS "setting SPDLOG_ACTIVE_LEVEL to SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}") - - if(KOMPUTE_OPT_SPDLOG_ASYNC_MODE) - target_compile_definitions(kp_logger INTERFACE KOMPUTE_SPDLOG_ASYNC_LOGGING=1) - endif() - else() - target_link_libraries(kp_logger PUBLIC fmt::fmt) - endif() -endif() diff --git a/kompute/src/logger/Logger.cpp 
b/kompute/src/logger/Logger.cpp deleted file mode 100644 index 69df2b609..000000000 --- a/kompute/src/logger/Logger.cpp +++ /dev/null @@ -1,101 +0,0 @@ -#include "kompute/logger/Logger.hpp" - -#if !KOMPUTE_OPT_LOG_LEVEL_DISABLED -#if !KOMPUTE_OPT_USE_SPDLOG -#else -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#endif // !KOMPUTE_OPT_USE_SPDLOG - -namespace logger { -#if !KOMPUTE_OPT_USE_SPDLOG - -void -setupLogger() -{ -} - -#else -constexpr int THREAD_QUEUE_LENGTH = 8192; - -void -setupLogger() -{ - // Ensure we setup the logger only once - static bool setup = false; - static std::mutex setupMutex{}; - setupMutex.lock(); - if (setup) { - setupMutex.unlock(); - return; - } - setup = true; - setupMutex.unlock(); - - spdlog::init_thread_pool(THREAD_QUEUE_LENGTH, 1); - spdlog::sink_ptr console_sink = - std::make_shared(); -#if SPDLOG_ACTIVE_LEVEL < SPDLOG_LEVEL_INFO - console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=21s] %v"); -#else - console_sink->set_pattern("[%H:%M:%S %z] [%^%=9l%$] [%=15s] %v"); -#endif - std::vector sinks{ console_sink }; - // TODO: Add flag in compile flags - std::shared_ptr logger = -#if KOMPUTE_SPDLOG_ASYNC_LOGGING - std::make_shared( - "", - sinks.begin(), - sinks.end(), - spdlog::thread_pool(), - spdlog::async_overflow_policy::block); -#else - std::make_shared( - "", - sinks.begin(), - sinks.end()); -#endif - - logger->set_level(getLogLevel()); - - spdlog::set_default_logger(logger); -} - -spdlog::level::level_enum -getLogLevel() -{ -#if SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_TRACE - return spdlog::level::trace; -#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_DEBUG - return spdlog::level::debug; -#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_INFO - return spdlog::level::info; -#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_WARN - return spdlog::level::warn; -#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_ERROR - return spdlog::level::error; -#elif SPDLOG_ACTIVE_LEVEL == SPDLOG_LEVEL_CRITICAL - return spdlog::level::critical; -#else - return spdlog::level::off; -#endif -} - -void -setLogLevel(const spdlog::level::level_enum level) -{ - spdlog::default_logger()->set_level(level); -} -#endif // !KOMPUTE_OPT_USE_SPDLOG -} // namespace logger - -#endif diff --git a/kompute/src/shaders/CMakeLists.txt b/kompute/src/shaders/CMakeLists.txt deleted file mode 100644 index 901bf3e8a..000000000 --- a/kompute/src/shaders/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# ###################### -cmake_minimum_required(VERSION 3.20) - -add_subdirectory(glsl) \ No newline at end of file diff --git a/kompute/src/shaders/glsl/CMakeLists.txt b/kompute/src/shaders/glsl/CMakeLists.txt deleted file mode 100644 index 3101a2b17..000000000 --- a/kompute/src/shaders/glsl/CMakeLists.txt +++ /dev/null @@ -1,26 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# ###################### -cmake_minimum_required(VERSION 3.20) - -# Check if build shaders from source is enabled -if(KOMPUTE_OPT_BUILD_SHADERS) - vulkan_compile_shader(INFILE ShaderOpMult.comp - OUTFILE ShaderOpMult.hpp - NAMESPACE "kp") - - vulkan_compile_shader(INFILE ShaderLogisticRegression.comp - OUTFILE ShaderLogisticRegression.hpp - NAMESPACE "kp") -else() # Else we will use our precompiled versions - add_custom_command(OUTPUT $/ShaderOpMult.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/ShaderOpMult.hpp.in $/ShaderOpMult.hpp) - add_custom_command(OUTPUT $/ShaderLogisticRegression.hpp COMMAND ${CMAKE_COMMAND} -E copy_if_different 
${CMAKE_CURRENT_SOURCE_DIR}/ShaderLogisticRegression.hpp.in $/ShaderLogisticRegression.hpp) -endif() - -add_library(kp_shader INTERFACE "${CMAKE_CURRENT_BINARY_DIR}/ShaderOpMult.hpp" - "${CMAKE_CURRENT_BINARY_DIR}/ShaderLogisticRegression.hpp") - -target_include_directories(kp_shader INTERFACE $) - -# Make sure we install shaders: -install(FILES $/ShaderOpMult.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) -install(FILES $/ShaderLogisticRegression.hpp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) diff --git a/kompute/src/shaders/glsl/ShaderLogisticRegression.comp b/kompute/src/shaders/glsl/ShaderLogisticRegression.comp deleted file mode 100644 index 5a1c5d948..000000000 --- a/kompute/src/shaders/glsl/ShaderLogisticRegression.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -layout (constant_id = 0) const float m = 0; - -layout (local_size_x = 1) in; - -layout(set = 0, binding = 0) buffer bxi { float xi[]; }; -layout(set = 0, binding = 1) buffer bxj { float xj[]; }; -layout(set = 0, binding = 2) buffer by { float y[]; }; -layout(set = 0, binding = 3) buffer bwin { float win[]; }; -layout(set = 0, binding = 4) buffer bwouti { float wouti[]; }; -layout(set = 0, binding = 5) buffer bwoutj { float woutj[]; }; -layout(set = 0, binding = 6) buffer bbin { float bin[]; }; -layout(set = 0, binding = 7) buffer bbout { float bout[]; }; -layout(set = 0, binding = 8) buffer blout { float lout[]; }; - -float sigmoid(float z) { - return 1.0 / (1.0 + exp(-z)); -} - -float inference(vec2 x, vec2 w, float b) { - // Compute the linear mapping function - float z = dot(w, x) + b; - // Calculate the y-hat with sigmoid - float yHat = sigmoid(z); - return yHat; -} - -float calculateLoss(float yHat, float y) { - return -(y * log(yHat) + (1.0 - y) * log(1.0 - yHat)); -} - -void main() { - uint idx = gl_GlobalInvocationID.x; - - vec2 wCurr = vec2(win[0], win[1]); - float bCurr = bin[0]; - - vec2 xCurr = vec2(xi[idx], xj[idx]); - float yCurr = y[idx]; - - float yHat = inference(xCurr, wCurr, bCurr); - - float dZ = yHat - yCurr; - vec2 dW = (1. / m) * xCurr * dZ; - float dB = (1. 
/ m) * dZ; - wouti[idx] = dW.x; - woutj[idx] = dW.y; - bout[idx] = dB; - - lout[idx] = calculateLoss(yHat, yCurr); -} diff --git a/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in b/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in deleted file mode 100644 index bfe7792c6..000000000 --- a/kompute/src/shaders/glsl/ShaderLogisticRegression.hpp.in +++ /dev/null @@ -1,310 +0,0 @@ -#pragma once -#include -#include - -namespace kp { -const std::array SHADERLOGISTICREGRESSION_COMP_SPV = { -0x07230203, 0x00010000, 0x0008000a, 0x000000ae, -0x00000000, 0x00020011, 0x00000001, 0x0006000b, -0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, -0x00000000, 0x0003000e, 0x00000000, 0x00000001, -0x0006000f, 0x00000005, 0x00000004, 0x6e69616d, -0x00000000, 0x00000041, 0x00060010, 0x00000004, -0x00000011, 0x00000001, 0x00000001, 0x00000001, -0x00030003, 0x00000002, 0x000001c2, 0x00040005, -0x00000004, 0x6e69616d, 0x00000000, 0x00050005, -0x0000000a, 0x6d676973, 0x2864696f, 0x003b3166, -0x00030005, 0x00000009, 0x0000007a, 0x00080005, -0x00000012, 0x65666e69, 0x636e6572, 0x66762865, -0x66763b32, 0x31663b32, 0x0000003b, 0x00030005, -0x0000000f, 0x00000078, 0x00030005, 0x00000010, -0x00000077, 0x00030005, 0x00000011, 0x00000062, -0x00080005, 0x00000017, 0x636c6163, 0x74616c75, -0x736f4c65, 0x31662873, 0x3b31663b, 0x00000000, -0x00040005, 0x00000015, 0x74614879, 0x00000000, -0x00030005, 0x00000016, 0x00000079, 0x00030005, -0x00000021, 0x0000007a, 0x00040005, 0x00000027, -0x74614879, 0x00000000, 0x00040005, 0x00000028, -0x61726170, 0x0000006d, 0x00030005, 0x0000003e, -0x00786469, 0x00080005, 0x00000041, 0x475f6c67, -0x61626f6c, 0x766e496c, 0x7461636f, 0x496e6f69, -0x00000044, 0x00040005, 0x00000046, 0x72754377, -0x00000072, 0x00040005, 0x00000048, 0x6e697762, -0x00000000, 0x00040006, 0x00000048, 0x00000000, -0x006e6977, 0x00030005, 0x0000004a, 0x00000000, -0x00040005, 0x00000054, 0x72754362, 0x00000072, -0x00040005, 0x00000056, 0x6e696262, 0x00000000, -0x00040006, 0x00000056, 0x00000000, 0x006e6962, -0x00030005, 0x00000058, 0x00000000, 0x00040005, -0x0000005b, 0x72754378, 0x00000072, 0x00030005, -0x0000005d, 0x00697862, 0x00040006, 0x0000005d, -0x00000000, 0x00006978, 0x00030005, 0x0000005f, -0x00000000, 0x00030005, 0x00000064, 0x006a7862, -0x00040006, 0x00000064, 0x00000000, 0x00006a78, -0x00030005, 0x00000066, 0x00000000, 0x00040005, -0x0000006b, 0x72754379, 0x00000072, 0x00030005, -0x0000006d, 0x00007962, 0x00040006, 0x0000006d, -0x00000000, 0x00000079, 0x00030005, 0x0000006f, -0x00000000, 0x00040005, 0x00000073, 0x74614879, -0x00000000, 0x00040005, 0x00000074, 0x61726170, -0x0000006d, 0x00040005, 0x00000076, 0x61726170, -0x0000006d, 0x00040005, 0x00000078, 0x61726170, -0x0000006d, 0x00030005, 0x0000007b, 0x00005a64, -0x00030005, 0x0000007f, 0x00005764, 0x00030005, -0x00000080, 0x0000006d, 0x00030005, 0x00000086, -0x00004264, 0x00040005, 0x0000008b, 0x756f7762, -0x00006974, 0x00050006, 0x0000008b, 0x00000000, -0x74756f77, 0x00000069, 0x00030005, 0x0000008d, -0x00000000, 0x00040005, 0x00000093, 0x756f7762, -0x00006a74, 0x00050006, 0x00000093, 0x00000000, -0x74756f77, 0x0000006a, 0x00030005, 0x00000095, -0x00000000, 0x00040005, 0x0000009c, 0x756f6262, -0x00000074, 0x00050006, 0x0000009c, 0x00000000, -0x74756f62, 0x00000000, 0x00030005, 0x0000009e, -0x00000000, 0x00040005, 0x000000a3, 0x756f6c62, -0x00000074, 0x00050006, 0x000000a3, 0x00000000, -0x74756f6c, 0x00000000, 0x00030005, 0x000000a5, -0x00000000, 0x00040005, 0x000000a7, 0x61726170, -0x0000006d, 0x00040005, 0x000000a9, 0x61726170, -0x0000006d, 
0x00040047, 0x00000041, 0x0000000b, -0x0000001c, 0x00040047, 0x00000047, 0x00000006, -0x00000004, 0x00050048, 0x00000048, 0x00000000, -0x00000023, 0x00000000, 0x00030047, 0x00000048, -0x00000003, 0x00040047, 0x0000004a, 0x00000022, -0x00000000, 0x00040047, 0x0000004a, 0x00000021, -0x00000003, 0x00040047, 0x00000055, 0x00000006, -0x00000004, 0x00050048, 0x00000056, 0x00000000, -0x00000023, 0x00000000, 0x00030047, 0x00000056, -0x00000003, 0x00040047, 0x00000058, 0x00000022, -0x00000000, 0x00040047, 0x00000058, 0x00000021, -0x00000006, 0x00040047, 0x0000005c, 0x00000006, -0x00000004, 0x00050048, 0x0000005d, 0x00000000, -0x00000023, 0x00000000, 0x00030047, 0x0000005d, -0x00000003, 0x00040047, 0x0000005f, 0x00000022, -0x00000000, 0x00040047, 0x0000005f, 0x00000021, -0x00000000, 0x00040047, 0x00000063, 0x00000006, -0x00000004, 0x00050048, 0x00000064, 0x00000000, -0x00000023, 0x00000000, 0x00030047, 0x00000064, -0x00000003, 0x00040047, 0x00000066, 0x00000022, -0x00000000, 0x00040047, 0x00000066, 0x00000021, -0x00000001, 0x00040047, 0x0000006c, 0x00000006, -0x00000004, 0x00050048, 0x0000006d, 0x00000000, -0x00000023, 0x00000000, 0x00030047, 0x0000006d, -0x00000003, 0x00040047, 0x0000006f, 0x00000022, -0x00000000, 0x00040047, 0x0000006f, 0x00000021, -0x00000002, 0x00040047, 0x00000080, 0x00000001, -0x00000000, 0x00040047, 0x0000008a, 0x00000006, -0x00000004, 0x00050048, 0x0000008b, 0x00000000, -0x00000023, 0x00000000, 0x00030047, 0x0000008b, -0x00000003, 0x00040047, 0x0000008d, 0x00000022, -0x00000000, 0x00040047, 0x0000008d, 0x00000021, -0x00000004, 0x00040047, 0x00000092, 0x00000006, -0x00000004, 0x00050048, 0x00000093, 0x00000000, -0x00000023, 0x00000000, 0x00030047, 0x00000093, -0x00000003, 0x00040047, 0x00000095, 0x00000022, -0x00000000, 0x00040047, 0x00000095, 0x00000021, -0x00000005, 0x00040047, 0x0000009b, 0x00000006, -0x00000004, 0x00050048, 0x0000009c, 0x00000000, -0x00000023, 0x00000000, 0x00030047, 0x0000009c, -0x00000003, 0x00040047, 0x0000009e, 0x00000022, -0x00000000, 0x00040047, 0x0000009e, 0x00000021, -0x00000007, 0x00040047, 0x000000a2, 0x00000006, -0x00000004, 0x00050048, 0x000000a3, 0x00000000, -0x00000023, 0x00000000, 0x00030047, 0x000000a3, -0x00000003, 0x00040047, 0x000000a5, 0x00000022, -0x00000000, 0x00040047, 0x000000a5, 0x00000021, -0x00000008, 0x00040047, 0x000000ad, 0x0000000b, -0x00000019, 0x00020013, 0x00000002, 0x00030021, -0x00000003, 0x00000002, 0x00030016, 0x00000006, -0x00000020, 0x00040020, 0x00000007, 0x00000007, -0x00000006, 0x00040021, 0x00000008, 0x00000006, -0x00000007, 0x00040017, 0x0000000c, 0x00000006, -0x00000002, 0x00040020, 0x0000000d, 0x00000007, -0x0000000c, 0x00060021, 0x0000000e, 0x00000006, -0x0000000d, 0x0000000d, 0x00000007, 0x00050021, -0x00000014, 0x00000006, 0x00000007, 0x00000007, -0x0004002b, 0x00000006, 0x00000019, 0x3f800000, -0x00040015, 0x0000003c, 0x00000020, 0x00000000, -0x00040020, 0x0000003d, 0x00000007, 0x0000003c, -0x00040017, 0x0000003f, 0x0000003c, 0x00000003, -0x00040020, 0x00000040, 0x00000001, 0x0000003f, -0x0004003b, 0x00000040, 0x00000041, 0x00000001, -0x0004002b, 0x0000003c, 0x00000042, 0x00000000, -0x00040020, 0x00000043, 0x00000001, 0x0000003c, -0x0003001d, 0x00000047, 0x00000006, 0x0003001e, -0x00000048, 0x00000047, 0x00040020, 0x00000049, -0x00000002, 0x00000048, 0x0004003b, 0x00000049, -0x0000004a, 0x00000002, 0x00040015, 0x0000004b, -0x00000020, 0x00000001, 0x0004002b, 0x0000004b, -0x0000004c, 0x00000000, 0x00040020, 0x0000004d, -0x00000002, 0x00000006, 0x0004002b, 0x0000004b, -0x00000050, 0x00000001, 0x0003001d, 
0x00000055, -0x00000006, 0x0003001e, 0x00000056, 0x00000055, -0x00040020, 0x00000057, 0x00000002, 0x00000056, -0x0004003b, 0x00000057, 0x00000058, 0x00000002, -0x0003001d, 0x0000005c, 0x00000006, 0x0003001e, -0x0000005d, 0x0000005c, 0x00040020, 0x0000005e, -0x00000002, 0x0000005d, 0x0004003b, 0x0000005e, -0x0000005f, 0x00000002, 0x0003001d, 0x00000063, -0x00000006, 0x0003001e, 0x00000064, 0x00000063, -0x00040020, 0x00000065, 0x00000002, 0x00000064, -0x0004003b, 0x00000065, 0x00000066, 0x00000002, -0x0003001d, 0x0000006c, 0x00000006, 0x0003001e, -0x0000006d, 0x0000006c, 0x00040020, 0x0000006e, -0x00000002, 0x0000006d, 0x0004003b, 0x0000006e, -0x0000006f, 0x00000002, 0x00040032, 0x00000006, -0x00000080, 0x00000000, 0x0003001d, 0x0000008a, -0x00000006, 0x0003001e, 0x0000008b, 0x0000008a, -0x00040020, 0x0000008c, 0x00000002, 0x0000008b, -0x0004003b, 0x0000008c, 0x0000008d, 0x00000002, -0x0003001d, 0x00000092, 0x00000006, 0x0003001e, -0x00000093, 0x00000092, 0x00040020, 0x00000094, -0x00000002, 0x00000093, 0x0004003b, 0x00000094, -0x00000095, 0x00000002, 0x0004002b, 0x0000003c, -0x00000097, 0x00000001, 0x0003001d, 0x0000009b, -0x00000006, 0x0003001e, 0x0000009c, 0x0000009b, -0x00040020, 0x0000009d, 0x00000002, 0x0000009c, -0x0004003b, 0x0000009d, 0x0000009e, 0x00000002, -0x0003001d, 0x000000a2, 0x00000006, 0x0003001e, -0x000000a3, 0x000000a2, 0x00040020, 0x000000a4, -0x00000002, 0x000000a3, 0x0004003b, 0x000000a4, -0x000000a5, 0x00000002, 0x0006002c, 0x0000003f, -0x000000ad, 0x00000097, 0x00000097, 0x00000097, -0x00050036, 0x00000002, 0x00000004, 0x00000000, -0x00000003, 0x000200f8, 0x00000005, 0x0004003b, -0x0000003d, 0x0000003e, 0x00000007, 0x0004003b, -0x0000000d, 0x00000046, 0x00000007, 0x0004003b, -0x00000007, 0x00000054, 0x00000007, 0x0004003b, -0x0000000d, 0x0000005b, 0x00000007, 0x0004003b, -0x00000007, 0x0000006b, 0x00000007, 0x0004003b, -0x00000007, 0x00000073, 0x00000007, 0x0004003b, -0x0000000d, 0x00000074, 0x00000007, 0x0004003b, -0x0000000d, 0x00000076, 0x00000007, 0x0004003b, -0x00000007, 0x00000078, 0x00000007, 0x0004003b, -0x00000007, 0x0000007b, 0x00000007, 0x0004003b, -0x0000000d, 0x0000007f, 0x00000007, 0x0004003b, -0x00000007, 0x00000086, 0x00000007, 0x0004003b, -0x00000007, 0x000000a7, 0x00000007, 0x0004003b, -0x00000007, 0x000000a9, 0x00000007, 0x00050041, -0x00000043, 0x00000044, 0x00000041, 0x00000042, -0x0004003d, 0x0000003c, 0x00000045, 0x00000044, -0x0003003e, 0x0000003e, 0x00000045, 0x00060041, -0x0000004d, 0x0000004e, 0x0000004a, 0x0000004c, -0x0000004c, 0x0004003d, 0x00000006, 0x0000004f, -0x0000004e, 0x00060041, 0x0000004d, 0x00000051, -0x0000004a, 0x0000004c, 0x00000050, 0x0004003d, -0x00000006, 0x00000052, 0x00000051, 0x00050050, -0x0000000c, 0x00000053, 0x0000004f, 0x00000052, -0x0003003e, 0x00000046, 0x00000053, 0x00060041, -0x0000004d, 0x00000059, 0x00000058, 0x0000004c, -0x0000004c, 0x0004003d, 0x00000006, 0x0000005a, -0x00000059, 0x0003003e, 0x00000054, 0x0000005a, -0x0004003d, 0x0000003c, 0x00000060, 0x0000003e, -0x00060041, 0x0000004d, 0x00000061, 0x0000005f, -0x0000004c, 0x00000060, 0x0004003d, 0x00000006, -0x00000062, 0x00000061, 0x0004003d, 0x0000003c, -0x00000067, 0x0000003e, 0x00060041, 0x0000004d, -0x00000068, 0x00000066, 0x0000004c, 0x00000067, -0x0004003d, 0x00000006, 0x00000069, 0x00000068, -0x00050050, 0x0000000c, 0x0000006a, 0x00000062, -0x00000069, 0x0003003e, 0x0000005b, 0x0000006a, -0x0004003d, 0x0000003c, 0x00000070, 0x0000003e, -0x00060041, 0x0000004d, 0x00000071, 0x0000006f, -0x0000004c, 0x00000070, 0x0004003d, 0x00000006, -0x00000072, 
0x00000071, 0x0003003e, 0x0000006b, -0x00000072, 0x0004003d, 0x0000000c, 0x00000075, -0x0000005b, 0x0003003e, 0x00000074, 0x00000075, -0x0004003d, 0x0000000c, 0x00000077, 0x00000046, -0x0003003e, 0x00000076, 0x00000077, 0x0004003d, -0x00000006, 0x00000079, 0x00000054, 0x0003003e, -0x00000078, 0x00000079, 0x00070039, 0x00000006, -0x0000007a, 0x00000012, 0x00000074, 0x00000076, -0x00000078, 0x0003003e, 0x00000073, 0x0000007a, -0x0004003d, 0x00000006, 0x0000007c, 0x00000073, -0x0004003d, 0x00000006, 0x0000007d, 0x0000006b, -0x00050083, 0x00000006, 0x0000007e, 0x0000007c, -0x0000007d, 0x0003003e, 0x0000007b, 0x0000007e, -0x00050088, 0x00000006, 0x00000081, 0x00000019, -0x00000080, 0x0004003d, 0x0000000c, 0x00000082, -0x0000005b, 0x0005008e, 0x0000000c, 0x00000083, -0x00000082, 0x00000081, 0x0004003d, 0x00000006, -0x00000084, 0x0000007b, 0x0005008e, 0x0000000c, -0x00000085, 0x00000083, 0x00000084, 0x0003003e, -0x0000007f, 0x00000085, 0x00050088, 0x00000006, -0x00000087, 0x00000019, 0x00000080, 0x0004003d, -0x00000006, 0x00000088, 0x0000007b, 0x00050085, -0x00000006, 0x00000089, 0x00000087, 0x00000088, -0x0003003e, 0x00000086, 0x00000089, 0x0004003d, -0x0000003c, 0x0000008e, 0x0000003e, 0x00050041, -0x00000007, 0x0000008f, 0x0000007f, 0x00000042, -0x0004003d, 0x00000006, 0x00000090, 0x0000008f, -0x00060041, 0x0000004d, 0x00000091, 0x0000008d, -0x0000004c, 0x0000008e, 0x0003003e, 0x00000091, -0x00000090, 0x0004003d, 0x0000003c, 0x00000096, -0x0000003e, 0x00050041, 0x00000007, 0x00000098, -0x0000007f, 0x00000097, 0x0004003d, 0x00000006, -0x00000099, 0x00000098, 0x00060041, 0x0000004d, -0x0000009a, 0x00000095, 0x0000004c, 0x00000096, -0x0003003e, 0x0000009a, 0x00000099, 0x0004003d, -0x0000003c, 0x0000009f, 0x0000003e, 0x0004003d, -0x00000006, 0x000000a0, 0x00000086, 0x00060041, -0x0000004d, 0x000000a1, 0x0000009e, 0x0000004c, -0x0000009f, 0x0003003e, 0x000000a1, 0x000000a0, -0x0004003d, 0x0000003c, 0x000000a6, 0x0000003e, -0x0004003d, 0x00000006, 0x000000a8, 0x00000073, -0x0003003e, 0x000000a7, 0x000000a8, 0x0004003d, -0x00000006, 0x000000aa, 0x0000006b, 0x0003003e, -0x000000a9, 0x000000aa, 0x00060039, 0x00000006, -0x000000ab, 0x00000017, 0x000000a7, 0x000000a9, -0x00060041, 0x0000004d, 0x000000ac, 0x000000a5, -0x0000004c, 0x000000a6, 0x0003003e, 0x000000ac, -0x000000ab, 0x000100fd, 0x00010038, 0x00050036, -0x00000006, 0x0000000a, 0x00000000, 0x00000008, -0x00030037, 0x00000007, 0x00000009, 0x000200f8, -0x0000000b, 0x0004003d, 0x00000006, 0x0000001a, -0x00000009, 0x0004007f, 0x00000006, 0x0000001b, -0x0000001a, 0x0006000c, 0x00000006, 0x0000001c, -0x00000001, 0x0000001b, 0x0000001b, 0x00050081, -0x00000006, 0x0000001d, 0x00000019, 0x0000001c, -0x00050088, 0x00000006, 0x0000001e, 0x00000019, -0x0000001d, 0x000200fe, 0x0000001e, 0x00010038, -0x00050036, 0x00000006, 0x00000012, 0x00000000, -0x0000000e, 0x00030037, 0x0000000d, 0x0000000f, -0x00030037, 0x0000000d, 0x00000010, 0x00030037, -0x00000007, 0x00000011, 0x000200f8, 0x00000013, -0x0004003b, 0x00000007, 0x00000021, 0x00000007, -0x0004003b, 0x00000007, 0x00000027, 0x00000007, -0x0004003b, 0x00000007, 0x00000028, 0x00000007, -0x0004003d, 0x0000000c, 0x00000022, 0x00000010, -0x0004003d, 0x0000000c, 0x00000023, 0x0000000f, -0x00050094, 0x00000006, 0x00000024, 0x00000022, -0x00000023, 0x0004003d, 0x00000006, 0x00000025, -0x00000011, 0x00050081, 0x00000006, 0x00000026, -0x00000024, 0x00000025, 0x0003003e, 0x00000021, -0x00000026, 0x0004003d, 0x00000006, 0x00000029, -0x00000021, 0x0003003e, 0x00000028, 0x00000029, -0x00050039, 0x00000006, 0x0000002a, 
0x0000000a, -0x00000028, 0x0003003e, 0x00000027, 0x0000002a, -0x0004003d, 0x00000006, 0x0000002b, 0x00000027, -0x000200fe, 0x0000002b, 0x00010038, 0x00050036, -0x00000006, 0x00000017, 0x00000000, 0x00000014, -0x00030037, 0x00000007, 0x00000015, 0x00030037, -0x00000007, 0x00000016, 0x000200f8, 0x00000018, -0x0004003d, 0x00000006, 0x0000002e, 0x00000016, -0x0004003d, 0x00000006, 0x0000002f, 0x00000015, -0x0006000c, 0x00000006, 0x00000030, 0x00000001, -0x0000001c, 0x0000002f, 0x00050085, 0x00000006, -0x00000031, 0x0000002e, 0x00000030, 0x0004003d, -0x00000006, 0x00000032, 0x00000016, 0x00050083, -0x00000006, 0x00000033, 0x00000019, 0x00000032, -0x0004003d, 0x00000006, 0x00000034, 0x00000015, -0x00050083, 0x00000006, 0x00000035, 0x00000019, -0x00000034, 0x0006000c, 0x00000006, 0x00000036, -0x00000001, 0x0000001c, 0x00000035, 0x00050085, -0x00000006, 0x00000037, 0x00000033, 0x00000036, -0x00050081, 0x00000006, 0x00000038, 0x00000031, -0x00000037, 0x0004007f, 0x00000006, 0x00000039, -0x00000038, 0x000200fe, 0x00000039, 0x00010038 }; -} // namespace kp - - diff --git a/kompute/src/shaders/glsl/ShaderOpMult.comp b/kompute/src/shaders/glsl/ShaderOpMult.comp deleted file mode 100644 index d54865037..000000000 --- a/kompute/src/shaders/glsl/ShaderOpMult.comp +++ /dev/null @@ -1,28 +0,0 @@ -#version 450 - -layout(set = 0, binding = 0) buffer tensorLhs { - float valuesLhs[ ]; -}; - -layout(set = 0, binding = 1) buffer tensorRhs { - float valuesRhs[ ]; -}; - -layout(set = 0, binding = 2) buffer tensorOutput { - float valuesOutput[ ]; -}; - -layout (constant_id = 0) const uint LEN_LHS = 0; -layout (constant_id = 1) const uint LEN_RHS = 0; -layout (constant_id = 2) const uint LEN_OUT = 0; - -layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in; - -void main() -{ - uint index = gl_GlobalInvocationID.x; - - valuesOutput[index] = valuesLhs[index] * valuesRhs[index]; -} - - diff --git a/kompute/src/shaders/glsl/ShaderOpMult.hpp.in b/kompute/src/shaders/glsl/ShaderOpMult.hpp.in deleted file mode 100644 index 5af29c66d..000000000 --- a/kompute/src/shaders/glsl/ShaderOpMult.hpp.in +++ /dev/null @@ -1,101 +0,0 @@ -#pragma once -#include -#include - -namespace kp { -const std::array SHADEROPMULT_COMP_SPV = { -0x07230203, 0x00010000, 0x0008000a, 0x0000002e, -0x00000000, 0x00020011, 0x00000001, 0x0006000b, -0x00000001, 0x4c534c47, 0x6474732e, 0x3035342e, -0x00000000, 0x0003000e, 0x00000000, 0x00000001, -0x0006000f, 0x00000005, 0x00000004, 0x6e69616d, -0x00000000, 0x0000000b, 0x00060010, 0x00000004, -0x00000011, 0x00000001, 0x00000001, 0x00000001, -0x00030003, 0x00000002, 0x000001c2, 0x00040005, -0x00000004, 0x6e69616d, 0x00000000, 0x00040005, -0x00000008, 0x65646e69, 0x00000078, 0x00080005, -0x0000000b, 0x475f6c67, 0x61626f6c, 0x766e496c, -0x7461636f, 0x496e6f69, 0x00000044, 0x00060005, -0x00000012, 0x736e6574, 0x754f726f, 0x74757074, -0x00000000, 0x00070006, 0x00000012, 0x00000000, -0x756c6176, 0x754f7365, 0x74757074, 0x00000000, -0x00030005, 0x00000014, 0x00000000, 0x00050005, -0x00000019, 0x736e6574, 0x684c726f, 0x00000073, -0x00060006, 0x00000019, 0x00000000, 0x756c6176, -0x684c7365, 0x00000073, 0x00030005, 0x0000001b, -0x00000000, 0x00050005, 0x00000021, 0x736e6574, -0x6852726f, 0x00000073, 0x00060006, 0x00000021, -0x00000000, 0x756c6176, 0x68527365, 0x00000073, -0x00030005, 0x00000023, 0x00000000, 0x00040005, -0x00000029, 0x5f4e454c, 0x0053484c, 0x00040005, -0x0000002a, 0x5f4e454c, 0x00534852, 0x00040005, -0x0000002b, 0x5f4e454c, 0x0054554f, 0x00040047, -0x0000000b, 0x0000000b, 0x0000001c, 
0x00040047, -0x00000011, 0x00000006, 0x00000004, 0x00050048, -0x00000012, 0x00000000, 0x00000023, 0x00000000, -0x00030047, 0x00000012, 0x00000003, 0x00040047, -0x00000014, 0x00000022, 0x00000000, 0x00040047, -0x00000014, 0x00000021, 0x00000002, 0x00040047, -0x00000018, 0x00000006, 0x00000004, 0x00050048, -0x00000019, 0x00000000, 0x00000023, 0x00000000, -0x00030047, 0x00000019, 0x00000003, 0x00040047, -0x0000001b, 0x00000022, 0x00000000, 0x00040047, -0x0000001b, 0x00000021, 0x00000000, 0x00040047, -0x00000020, 0x00000006, 0x00000004, 0x00050048, -0x00000021, 0x00000000, 0x00000023, 0x00000000, -0x00030047, 0x00000021, 0x00000003, 0x00040047, -0x00000023, 0x00000022, 0x00000000, 0x00040047, -0x00000023, 0x00000021, 0x00000001, 0x00040047, -0x00000029, 0x00000001, 0x00000000, 0x00040047, -0x0000002a, 0x00000001, 0x00000001, 0x00040047, -0x0000002b, 0x00000001, 0x00000002, 0x00040047, -0x0000002d, 0x0000000b, 0x00000019, 0x00020013, -0x00000002, 0x00030021, 0x00000003, 0x00000002, -0x00040015, 0x00000006, 0x00000020, 0x00000000, -0x00040020, 0x00000007, 0x00000007, 0x00000006, -0x00040017, 0x00000009, 0x00000006, 0x00000003, -0x00040020, 0x0000000a, 0x00000001, 0x00000009, -0x0004003b, 0x0000000a, 0x0000000b, 0x00000001, -0x0004002b, 0x00000006, 0x0000000c, 0x00000000, -0x00040020, 0x0000000d, 0x00000001, 0x00000006, -0x00030016, 0x00000010, 0x00000020, 0x0003001d, -0x00000011, 0x00000010, 0x0003001e, 0x00000012, -0x00000011, 0x00040020, 0x00000013, 0x00000002, -0x00000012, 0x0004003b, 0x00000013, 0x00000014, -0x00000002, 0x00040015, 0x00000015, 0x00000020, -0x00000001, 0x0004002b, 0x00000015, 0x00000016, -0x00000000, 0x0003001d, 0x00000018, 0x00000010, -0x0003001e, 0x00000019, 0x00000018, 0x00040020, -0x0000001a, 0x00000002, 0x00000019, 0x0004003b, -0x0000001a, 0x0000001b, 0x00000002, 0x00040020, -0x0000001d, 0x00000002, 0x00000010, 0x0003001d, -0x00000020, 0x00000010, 0x0003001e, 0x00000021, -0x00000020, 0x00040020, 0x00000022, 0x00000002, -0x00000021, 0x0004003b, 0x00000022, 0x00000023, -0x00000002, 0x00040032, 0x00000006, 0x00000029, -0x00000000, 0x00040032, 0x00000006, 0x0000002a, -0x00000000, 0x00040032, 0x00000006, 0x0000002b, -0x00000000, 0x0004002b, 0x00000006, 0x0000002c, -0x00000001, 0x0006002c, 0x00000009, 0x0000002d, -0x0000002c, 0x0000002c, 0x0000002c, 0x00050036, -0x00000002, 0x00000004, 0x00000000, 0x00000003, -0x000200f8, 0x00000005, 0x0004003b, 0x00000007, -0x00000008, 0x00000007, 0x00050041, 0x0000000d, -0x0000000e, 0x0000000b, 0x0000000c, 0x0004003d, -0x00000006, 0x0000000f, 0x0000000e, 0x0003003e, -0x00000008, 0x0000000f, 0x0004003d, 0x00000006, -0x00000017, 0x00000008, 0x0004003d, 0x00000006, -0x0000001c, 0x00000008, 0x00060041, 0x0000001d, -0x0000001e, 0x0000001b, 0x00000016, 0x0000001c, -0x0004003d, 0x00000010, 0x0000001f, 0x0000001e, -0x0004003d, 0x00000006, 0x00000024, 0x00000008, -0x00060041, 0x0000001d, 0x00000025, 0x00000023, -0x00000016, 0x00000024, 0x0004003d, 0x00000010, -0x00000026, 0x00000025, 0x00050085, 0x00000010, -0x00000027, 0x0000001f, 0x00000026, 0x00060041, -0x0000001d, 0x00000028, 0x00000014, 0x00000016, -0x00000017, 0x0003003e, 0x00000028, 0x00000027, -0x000100fd, 0x00010038 }; -} // namespace kp - - diff --git a/kompute/src/shaders/hlsl/computeheadless.comp b/kompute/src/shaders/hlsl/computeheadless.comp deleted file mode 100644 index ee3cd024f..000000000 --- a/kompute/src/shaders/hlsl/computeheadless.comp +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2020 Google LLC - -RWStructuredBuffer values : register(u0); -[[vk::constant_id(0)]] const uint 
BUFFER_ELEMENTS = 32; - -uint fibonacci(uint n) { - if(n <= 1){ - return n; - } - uint curr = 1; - uint prev = 1; - for(uint i = 2; i < n; ++i) { - uint temp = curr; - curr += prev; - prev = temp; - } - return curr; -} - -[numthreads(1, 1, 1)] -void main(uint3 GlobalInvocationID : SV_DispatchThreadID) -{ - uint index = GlobalInvocationID.x; - if (index >= BUFFER_ELEMENTS) - return; - values[index] = fibonacci(values[index]); -} - - From f7cb0a65ef7a6e5cf1ad318ca8514abef74771bd Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 13 Dec 2023 17:55:41 -0500 Subject: [PATCH 74/93] remove script with unclear purpose --- undump.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100755 undump.py diff --git a/undump.py b/undump.py deleted file mode 100755 index c3d8993be..000000000 --- a/undump.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python3 -import struct -import numpy as np -from pathlib import Path - -def undump(fn): - with open(fn, 'rb') as df: - dims = struct.unpack('=QQQQ', df.read(8*4)) - (dsz,) = struct.unpack('=Q', df.read(8)) - ## assume f32 - data = df.read(dsz) - data = [i for (i,) in struct.iter_unpack('=f', data)] - return np.array(data).reshape(dims).squeeze() - -if __name__ == '__main__': - for dfn in sorted(Path('.').glob('*.dump')): - darr = undump(dfn) - print(f'{dfn}: {darr.shape}\n{darr}') - From c8fd4ba8465db1fcf2af020f9b8ac0937e2721a2 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 14 Dec 2023 13:18:14 -0500 Subject: [PATCH 75/93] ggml : restore 'static' specifiers --- ggml.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ggml.c b/ggml.c index 2eaba0a82..f743df1f3 100644 --- a/ggml.c +++ b/ggml.c @@ -7104,7 +7104,7 @@ static void ggml_compute_forward_add_q_f32( } } -void ggml_compute_forward_add( +static void ggml_compute_forward_add( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -7682,7 +7682,7 @@ static void ggml_compute_forward_mul_f32( } } -void ggml_compute_forward_mul( +static void ggml_compute_forward_mul( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -8653,7 +8653,7 @@ static void ggml_compute_forward_elu( // ggml_compute_forward_relu -void ggml_compute_forward_relu_f32( +static void ggml_compute_forward_relu_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -8677,7 +8677,7 @@ void ggml_compute_forward_relu_f32( } } -void ggml_compute_forward_relu( +static void ggml_compute_forward_relu( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -8695,7 +8695,7 @@ void ggml_compute_forward_relu( // ggml_compute_forward_gelu -void ggml_compute_forward_gelu_f32( +static void ggml_compute_forward_gelu_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -8736,7 +8736,7 @@ void ggml_compute_forward_gelu_f32( } } -void ggml_compute_forward_gelu( +static void ggml_compute_forward_gelu( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -8813,7 +8813,7 @@ static void ggml_compute_forward_gelu_quick( // ggml_compute_forward_silu -void ggml_compute_forward_silu_f32( +static void ggml_compute_forward_silu_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -8854,7 
+8854,7 @@ void ggml_compute_forward_silu_f32( } } -void ggml_compute_forward_silu( +static void ggml_compute_forward_silu( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -9029,7 +9029,7 @@ static void ggml_compute_forward_norm_f32( } } -void ggml_compute_forward_norm( +static void ggml_compute_forward_norm( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -9095,7 +9095,7 @@ static void ggml_compute_forward_rms_norm_f32( } } -void ggml_compute_forward_rms_norm( +static void ggml_compute_forward_rms_norm( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -9999,7 +9999,7 @@ static void ggml_compute_forward_scale_f32( } } -void ggml_compute_forward_scale( +static void ggml_compute_forward_scale( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -10120,7 +10120,7 @@ static void ggml_compute_forward_set( // ggml_compute_forward_cpy -void ggml_compute_forward_cpy( +static void ggml_compute_forward_cpy( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -10264,7 +10264,7 @@ static void ggml_compute_forward_get_rows_f32( } } -void ggml_compute_forward_get_rows( +static void ggml_compute_forward_get_rows( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -10536,7 +10536,7 @@ static void ggml_compute_forward_diag_mask_f32( } } -void ggml_compute_forward_diag_mask_inf( +static void ggml_compute_forward_diag_mask_inf( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { @@ -10570,7 +10570,7 @@ static void ggml_compute_forward_diag_mask_zero( // ggml_compute_forward_soft_max -void ggml_compute_forward_soft_max_f32( +static void ggml_compute_forward_soft_max_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -11359,7 +11359,7 @@ static void ggml_compute_forward_rope_f16( } } -void ggml_compute_forward_rope( +static void ggml_compute_forward_rope( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, From f58f581ca8e4eef9bff1d98965e4a049a7a14cb5 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Fri, 15 Dec 2023 13:38:54 -0500 Subject: [PATCH 76/93] refactor llama.cpp modifications --- llama.cpp | 96 +++++++++++++++++++++---------------------------------- 1 file changed, 37 insertions(+), 59 deletions(-) diff --git a/llama.cpp b/llama.cpp index 82e1abbbd..f7c6f26d2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -731,7 +731,7 @@ static std::string llama_format_win_err(DWORD err) { struct llama_buffer { void * data = NULL; size_t size = 0; -#if defined(GGML_USE_KOMPUTE) +#ifdef GGML_USE_KOMPUTE ggml_vk_memory memory; #endif @@ -742,7 +742,7 @@ struct llama_buffer { void resize(size_t n) { llama_host_free(data); -#if defined(GGML_USE_KOMPUTE) +#ifdef GGML_USE_KOMPUTE if (ggml_vk_has_device()) { this->memory = ggml_vk_allocate(n); this->data = (uint8_t*)memory.data; @@ -764,7 +764,7 @@ struct llama_buffer { ~llama_buffer() { if (data) { -#if defined(GGML_USE_KOMPUTE) +#ifdef GGML_USE_KOMPUTE if (ggml_vk_has_device()) { ggml_vk_free_memory(memory); data = NULL; @@ -1517,7 +1517,6 @@ struct llama_context { #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; #endif - }; // @@ -2113,7 +2112,7 
@@ struct llama_model_loader { use_mmap = false; } -#if defined(GGML_USE_KOMPUTE) +#ifdef GGML_USE_KOMPUTE use_mmap = false; #endif this->use_mmap = use_mmap; @@ -3790,8 +3789,7 @@ static struct ggml_tensor * llm_build_inp_embd( const llama_hparams & hparams, const llama_batch & batch, struct ggml_tensor * tok_embd, - const llm_build_cb & cb, - struct ggml_tensor ** to_device_tensor = nullptr) { + const llm_build_cb & cb) { const int64_t n_embd = hparams.n_embd; struct ggml_tensor * inpL; @@ -3799,9 +3797,6 @@ static struct ggml_tensor * llm_build_inp_embd( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, batch.n_tokens); cb(inp_tokens, "inp_tokens", -1); - if (to_device_tensor) { - *to_device_tensor = inp_tokens; - } inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); } else { @@ -3810,9 +3805,6 @@ static struct ggml_tensor * llm_build_inp_embd( #endif inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); - if (to_device_tensor) { - *to_device_tensor = inpL; - } } return inpL; @@ -3820,7 +3812,7 @@ static struct ggml_tensor * llm_build_inp_embd( // Persimmon: n_rot = n_embd_head/2 // Other: n_rot = n_embd_head -static struct ggml_tensor * llm_build_k_shift( +static void llm_build_k_shift( struct ggml_context * ctx, const llama_hparams & hparams, const llama_cparams & cparams, @@ -3869,8 +3861,6 @@ static struct ggml_tensor * llm_build_k_shift( cb(tmp, "K_shifted", il); ggml_build_forward_expand(graph, tmp); } - - return K_shift; } static void llm_build_kv_store( @@ -4148,7 +4138,7 @@ struct llm_build_context { llama_buffer & buf_compute; -#if defined(GGML_USE_KOMPUTE) +#ifdef GGML_USE_KOMPUTE ggml_kompute_context * ctx_kompute; #endif @@ -4187,7 +4177,7 @@ struct llm_build_context { do_rope_shift (worst_case || kv_self.has_shift), cb (cb), buf_compute (lctx.buf_compute) -#if defined(GGML_USE_KOMPUTE) +#ifdef GGML_USE_KOMPUTE , ctx_kompute (lctx.ctx_kompute) #endif { @@ -4220,9 +4210,8 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - struct ggml_tensor * to_device_tensor = nullptr; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb, &to_device_tensor); + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -4238,9 +4227,8 @@ struct llm_build_context { cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed - struct ggml_tensor * K_shift = nullptr; if (do_rope_shift) { - K_shift = llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4336,21 +4324,6 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); -#if defined(GGML_USE_KOMPUTE) - if (ctx_kompute) { - if (!ggml_vk_has_h2d_all(ctx_kompute)) { - ggml_vk_h2d_all(ctx_kompute); - } else { - ggml_vk_h2d_tensor(ctx_kompute, to_device_tensor); - ggml_vk_h2d_tensor(ctx_kompute, inp_pos); - ggml_vk_h2d_tensor(ctx_kompute, KQ_mask); - if (K_shift) { - ggml_vk_h2d_tensor(ctx_kompute, K_shift); - } - } - } -#endif - return gf; } @@ -4479,9 +4452,8 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - struct ggml_tensor * to_device_tensor = nullptr; - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb, &to_device_tensor); + inpL = llm_build_inp_embd(ctx0, hparams, batch, 
model.tok_embd, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -4497,9 +4469,8 @@ struct llm_build_context { cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed - struct ggml_tensor * K_shift = nullptr; if (do_rope_shift) { - K_shift = llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4595,21 +4566,6 @@ struct llm_build_context { ggml_build_forward_expand(gf, cur); -#if defined(GGML_USE_KOMPUTE) - if (ctx_kompute) { - if (!ggml_vk_has_h2d_all(ctx_kompute)) { - ggml_vk_h2d_all(ctx_kompute); - } else { - ggml_vk_h2d_tensor(ctx_kompute, to_device_tensor); - ggml_vk_h2d_tensor(ctx_kompute, inp_pos); - ggml_vk_h2d_tensor(ctx_kompute, KQ_mask); - if (K_shift) { - ggml_vk_h2d_tensor(ctx_kompute, K_shift); - } - } - } -#endif - return gf; } @@ -5627,6 +5583,10 @@ static struct ggml_cgraph * llama_build_graph( const bool do_offload = true; // TODO: set to false after finishing refactoring #endif +#ifdef GGML_USE_KOMPUTE + const bool needs_h2d_all = lctx.ctx_kompute && !ggml_vk_has_h2d_all(lctx.ctx_kompute); +#endif + int n_non_view = 0; // number of non-view tensors that have been processed by the callback // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) @@ -5747,6 +5707,21 @@ static struct ggml_cgraph * llama_build_graph( n_non_view++; } +#ifdef GGML_USE_KOMPUTE + if (lctx.ctx_kompute && !needs_h2d_all) { + const char * offload_tensors[] = {"inp_tokens", "inp_pos", "KQ_mask", "K_shift"}; + for (auto off : offload_tensors) { + if (strcmp(name, off) == 0) { + ggml_vk_h2d_tensor(lctx.ctx_kompute, cur); + break; + } + } + if (strcmp(name, "inp_embd") == 0 && !batch.token) { + ggml_vk_h2d_tensor(lctx.ctx_kompute, cur); + } + } +#endif + // // offload layers // @@ -5915,6 +5890,12 @@ static struct ggml_cgraph * llama_build_graph( GGML_ASSERT(false); } +#ifdef GGML_USE_KOMPUTE + if (needs_h2d_all) { + ggml_vk_h2d_all(lctx.ctx_kompute); + } +#endif + llm.free(); if (worst_case) { @@ -6175,7 +6156,6 @@ static int llama_decode_internal( } } -#if 0 // extract embeddings if (!lctx.embedding.empty()) { auto & embedding_out = lctx.embedding; @@ -6183,7 +6163,6 @@ static int llama_decode_internal( embedding_out.resize(n_embd); memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(n_tokens - 1)), sizeof(float)*n_embd); } -#endif // measure the performance only for the single-token evals if (n_tokens == 1) { @@ -8622,7 +8601,6 @@ static int llama_apply_lora_from_file_internal( ) { LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - const int64_t t_start_lora_us = ggml_time_us(); auto fin = std::ifstream(path_lora, std::ios::binary); From 2d2c76acc42215e2ca11cf2d0a9f788324df66df Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 29 Nov 2023 18:17:57 -0500 Subject: [PATCH 77/93] vulkan : fix free of stack addr in llama_buffer --- llama.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index f7c6f26d2..ad431a27e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -765,8 +765,10 @@ struct llama_buffer { ~llama_buffer() { if (data) { #ifdef GGML_USE_KOMPUTE - if (ggml_vk_has_device()) { - ggml_vk_free_memory(memory); + if (memory.data) { + if (ggml_vk_has_device()) { + 
ggml_vk_free_memory(memory); + } data = NULL; return; } From 807270621016865bb5fb136295e962b47d4bf06d Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Fri, 15 Dec 2023 16:23:24 -0500 Subject: [PATCH 78/93] kompute : always destroy Manager via the destructor --- ggml-kompute.cpp | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index f70231bed..cc0adaf2f 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -63,17 +63,29 @@ struct ggml_kompute_context { // and consolidate the init functions and simplify object lifetime management. As it currently stands, // we *have* to have the kompute manager no matter what for device discovery, but the kompute context // is only created when a device is set and vulkan is explicitly turned on. -ggml_kompute_context *s_kompute_context = nullptr; -static kp::Manager *komputeManager() { - static kp::Manager *s_mgr = nullptr; - if (s_mgr && !s_mgr->hasInstance()) { +static ggml_kompute_context *s_kompute_context = nullptr; + +class kompute_manager { + kp::Manager *s_mgr = nullptr; + +public: + kp::Manager *operator()() { + if (s_mgr && !s_mgr->hasInstance()) { + destroy(); + } + if (!s_mgr) { + s_mgr = new kp::Manager; + } + return s_mgr; + } + + void destroy() { delete s_mgr; s_mgr = nullptr; } - if (!s_mgr) - s_mgr = new kp::Manager; - return s_mgr; -} +}; + +static kompute_manager komputeManager; #ifdef __linux__ __attribute__((constructor)) @@ -257,7 +269,7 @@ bool ggml_vk_init_device(int device) { bool ggml_vk_free_device() { if (!ggml_vk_has_device()) return false; - komputeManager()->destroy(); + komputeManager.destroy(); // FIXME: The lifetime of these two needs to be tied together as we're relying upon the fact // the llama_free(ctx) destroys this memory and we just set the singleton to nullptr here which // is very brittle From 44b1a97a15dd642c3938de94e5eeea5aabc4fc87 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Mon, 11 Dec 2023 13:04:43 -0500 Subject: [PATCH 79/93] kompute : fix -Wunused-private-field warnings from clang Fixes nomic-ai/gpt4all#1722 (cherry picked from commit 3cd95323d995af7df4b42f6461f3d919a9267dad) --- kompute | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kompute b/kompute index 2d0a8abc6..4565194ed 160000 --- a/kompute +++ b/kompute @@ -1 +1 @@ -Subproject commit 2d0a8abc64e90a0956390aa3f1854cb6d48141db +Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306 From 904c563dbc4620ac8f1f085a26441ecca68437a4 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 10 Jan 2024 12:12:59 -0500 Subject: [PATCH 80/93] sync xxd commands with GPT4All llama.cpp.cmake --- CMakeLists.txt | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f906de40..78c19da2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -440,19 +440,35 @@ if (LLAMA_KOMPUTE) string(REPLACE "." 
"_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") set(OUTPUT_HEADER_FILE "${HEADER_FILE}") message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} - COMMENT "Converting to hpp: ${FILE_NAME}" - ) + if(CMAKE_GENERATOR MATCHES "Visual Studio") + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" + ) + else() + add_custom_command( + OUTPUT ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} + COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} + DEPENDS ${spv_file} xxd + COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" + ) + endif() endforeach() endfunction() From 1eb8804c18c8b2afa623c6014eb04f2edacf04d3 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 10 Jan 2024 11:29:04 -0500 Subject: [PATCH 81/93] PR #4766 --- common/common.cpp | 65 +- common/common.h | 1 + examples/batched-bench/batched-bench.cpp | 3 + ggml-alloc.c | 12 + ggml-backend-impl.h | 29 +- ggml-backend.c | 477 +++-- ggml-backend.h | 55 +- ggml-cuda.cu | 828 ++++---- ggml-cuda.h | 26 +- ggml-impl.h | 2 + ggml-metal.m | 39 +- ggml-opencl.cpp | 335 +++- ggml-opencl.h | 16 +- ggml.c | 30 +- ggml.h | 9 +- llama.cpp | 2294 ++++++++-------------- llama.h | 17 +- tests/test-backend-ops.cpp | 26 +- 18 files changed, 2183 insertions(+), 2081 deletions(-) diff --git a/common/common.cpp 
b/common/common.cpp index 4e89fe516..4a6241fb5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -543,9 +543,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers = std::stoi(argv[i]); -#else +#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif @@ -554,9 +553,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers_draft = std::stoi(argv[i]); -#else +#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif @@ -565,25 +563,44 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); -#endif +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--split-mode" || arg == "-sm") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_LAYER; + } else if (arg_next == "row") { + params.split_mode = LLAMA_SPLIT_ROW; + } else { + invalid_param = true; + break; + } +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUBLAS } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { invalid_param = true; break; } -#ifdef GGML_USE_CUBLAS std::string arg_next = argv[i]; // split string by , and / const std::regex regex{R"([,/]+)"}; std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - + if (split_arg.size() >= LLAMA_MAX_DEVICES) { + invalid_param = true; + break; + } for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); @@ -591,14 +608,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.tensor_split[i] = 0.0f; } } -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); -#endif // GGML_USE_CUBLAS - } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { -#ifdef GGML_USE_CUBLAS - params.mul_mat_q = false; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. 
Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; @@ -909,14 +920,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" number of layers to store in VRAM\n"); printf(" -ngld N, --n-gpu-layers-draft N\n"); printf(" number of layers to store in VRAM for the draft model\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); -#ifdef GGML_USE_CUBLAS - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); -#endif // GGML_USE_CUBLAS + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); #endif printf(" -gan N, --grp-attn-n N\n"); printf(" group-attention factor (default: %d)\n", params.grp_attn_n); @@ -1033,6 +1045,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.n_gpu_layers = params.n_gpu_layers; } mparams.main_gpu = params.main_gpu; + mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; diff --git a/common/common.h b/common/common.h index e2bbfc258..5152b36d3 100644 --- a/common/common.h +++ b/common/common.h @@ -59,6 +59,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. 
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 57596ed98..7924db267 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -88,7 +88,10 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); + const std::vector t_split (LLAMA_MAX_DEVICES, 0.0f); + model_params.n_gpu_layers = n_gpu_layers; + model_params.tensor_split = t_split.data(); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); diff --git a/ggml-alloc.c b/ggml-alloc.c index a27dd54b0..7836f064e 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -229,6 +229,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) { alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows } else { alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; + ggml_backend_buffer_reset(alloc->buffer); } } @@ -779,10 +780,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (nbytes == 0) { // all the tensors in the context are already allocated +#ifndef NDEBUG + fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); +#endif return NULL; } ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes); + if (buffer == NULL) { + // failed to allocate buffer +#ifndef NDEBUG + fprintf(stderr, "%s: failed to allocate buffer\n", __func__); +#endif + return NULL; + } + ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer); for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index ca21b4743..859e923e2 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -16,9 +16,10 @@ extern "C" { typedef void * ggml_backend_buffer_type_context_t; struct ggml_backend_buffer_type_i { + const char * (*get_name) (ggml_backend_buffer_type_t buft); ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend // check if tensor data is in host memory // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) @@ -34,16 +35,17 @@ extern "C" { typedef void * ggml_backend_buffer_context_t; struct ggml_backend_buffer_i { - void (*free_buffer) (ggml_backend_buffer_t buffer); - //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras - void * (*get_base) (ggml_backend_buffer_t buffer); - void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + const char * (*get_name) (ggml_backend_buffer_t buffer); + void 
(*free_buffer) (ggml_backend_buffer_t buffer); + void * (*get_base) (ggml_backend_buffer_t buffer); + void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // (optional) copy tensor between different buffer-type, allow for single-copy tranfers - void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras }; struct ggml_backend_buffer { @@ -51,6 +53,7 @@ extern "C" { ggml_backend_buffer_type_t buft; ggml_backend_buffer_context_t context; size_t size; + enum ggml_backend_buffer_usage usage; }; ggml_backend_buffer_t ggml_backend_buffer_init( @@ -79,13 +82,13 @@ extern "C" { void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // (optional) asynchroneous tensor copy - void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_from_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to_async) (ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); void (*synchronize)(ggml_backend_t backend); // compute graph with a plan - ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); diff --git a/ggml-backend.c b/ggml-backend.c index 53e741cb8..535426b9a 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -15,6 +15,10 @@ // backend buffer type +const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name(buft); +} + ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } @@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init( /* .buft = */ buft, /* .context = */ context, /* .size = */ size, + /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY }; return buffer; } +const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name(buffer); +} + void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer == NULL) { return; @@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t } size_t ggml_backend_buffer_get_alignment 
(ggml_backend_buffer_t buffer) { - return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer)); + return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); } size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor); + return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); } void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -106,13 +115,23 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { } bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { - return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer)); + return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer)); } -ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) { +void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { + buffer->usage = usage; +} + +ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { return buffer->buft; } +void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { + if (buffer->iface.reset) { + buffer->iface.reset(buffer); + } +} + // backend const char * ggml_backend_name(ggml_backend_t backend) { @@ -392,6 +411,12 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { // backend CPU +static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { + return "CPU"; + + GGML_UNUSED(buffer); +} + static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *)buffer->context; } @@ -412,13 +437,13 @@ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, con GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); GGML_UNUSED(buffer); @@ -429,6 +454,7 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static struct ggml_backend_buffer_i cpu_backend_buffer_i = { + /* .get_name = */ ggml_backend_cpu_buffer_name, /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -437,10 +463,12 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; // for buffers from ptr, free is not called static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { + /* .get_name = */ ggml_backend_cpu_buffer_name, /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ 
ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required @@ -449,10 +477,17 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 +static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU"; + + GGML_UNUSED(buft); +} + static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? @@ -483,6 +518,7 @@ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -501,6 +537,18 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { #include +static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_HBM"; + + GGML_UNUSED(buft); +} + +static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { + return "CPU_HBM"; + + GGML_UNUSED(buf); +} + static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } @@ -514,17 +562,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_ return NULL; } - // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name; buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; return buffer; } -ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() { +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -568,7 +617,7 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; -static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); @@ -661,7 +710,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_cpu_name; + return backend && backend->iface.get_name == ggml_backend_cpu_name; } void 
ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { @@ -685,7 +734,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user // scheduler -#define GGML_MAX_BACKENDS 4 +#define GGML_MAX_BACKENDS 16 #define GGML_MAX_SPLITS 256 #define GGML_MAX_SPLIT_INPUTS 16 @@ -695,9 +744,16 @@ struct ggml_backend_sched_split { int i_end; struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS]; int n_inputs; + // graph view of this split struct ggml_cgraph graph; }; +// TODO: group all the hash values into a single struct for clarity +//struct sched_hash_value { +// ggml_tallocr_t tallocr; +// struct ggml_tensor * copies[GGML_MAX_BACKENDS]; +//}; + struct ggml_backend_sched { int n_backends; ggml_backend_t backends[GGML_MAX_BACKENDS]; @@ -705,11 +761,15 @@ struct ggml_backend_sched { ggml_gallocr_t galloc; + // hash keys of the nodes in the graph struct ggml_hash_set hash_set; - ggml_tallocr_t * node_talloc; // [hash_set.size] - struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS] + // hash values (arrays of [hash_set.size]) + ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend) + struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend + // copy of the graph with modified inputs struct ggml_cgraph * graph; + struct ggml_backend_sched_split splits[GGML_MAX_SPLITS]; int n_splits; @@ -777,7 +837,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc } #if 0 -static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove +static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) #define GET_CAUSE(node) causes[hash_id(node)] #else @@ -790,6 +850,7 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there // ie. kv cache updates // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend. 
+ // dst ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer); if (cur_backend != NULL) { @@ -804,7 +865,6 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct } // src - int cur_prio = INT_MAX; size_t cur_size = 0; for (int i = 0; i < GGML_MAX_SRC; i++) { @@ -812,16 +872,20 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct if (src == NULL) { break; } + ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer); - if (src_backend != NULL) { - int src_prio = sched_backend_prio(sched, src_backend); - size_t src_size = ggml_nbytes(src); - if (src_prio < cur_prio && src_size >= cur_size) { - cur_prio = src_prio; - cur_size = src_size; - cur_backend = src_backend; - SET_CAUSE(node, "1.src%d", i); - } + if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + // operations with weights are always on the same backend as the weights + cur_backend = src_backend; + SET_CAUSE(node, "1.wgt%d", i); + break; + } + + size_t src_size = ggml_nbytes(src); + if (src_size >= cur_size) { + cur_size = src_size; + cur_backend = src_backend; + SET_CAUSE(node, "1.src%d", i); } } return cur_backend; @@ -857,7 +921,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra } ggml_tallocr_t node_allocr = node_allocr(node); ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME: - fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name, + fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -866,7 +930,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra } ggml_tallocr_t src_allocr = node_allocr(src); ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL; - fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, + fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? 
ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } fprintf(stderr, "\n"); @@ -882,14 +946,16 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co return dup; } + +//#define DEBUG_PASS1 +//#define DEBUG_PASS2 +//#define DEBUG_PASS3 +//#define DEBUG_PASS4 + // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend // TODO: merge passes static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - // reset state - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); - memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); - memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + // reset splits sched->n_splits = 0; struct ggml_init_params params = { @@ -898,11 +964,13 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g /* .no_alloc = */ true }; - if (sched->ctx != NULL) { - ggml_free(sched->ctx); - } + ggml_free(sched->ctx); sched->ctx = ggml_init(params); + if (sched->ctx == NULL) { + fprintf(stderr, "%s: failed to initialize context\n", __func__); + GGML_ASSERT(false); + } // pass 1: assign backends to ops with allocated inputs for (int i = 0; i < graph->n_leafs; i++) { @@ -931,45 +999,91 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend); } } - //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS1 + fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif // pass 2: assign backends to ops from current assignments - // TODO: - // - reuse sched_backend_from_cur - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr == NULL) { - int cur_prio = INT_MAX; - size_t cur_size = 0; - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - break; - } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != NULL) { - int src_prio = sched_allocr_prio(sched, src_allocr); - size_t src_size = ggml_nbytes(src); - if (src_prio < cur_prio && src_size >= cur_size) { - cur_prio = src_prio; - cur_size = src_size; - node_allocr = src_allocr; - SET_CAUSE(node, "2.src%d", j); - } - } + // start from the end and assign the same backend to previous ops + + // expand gpu backends (i.e. 
non last prio) up and down, ignoring cpu + // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops + + // pass 2.1 expand gpu up + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; } + ggml_tallocr_t node_allocr = node_allocr(node); if (node_allocr != NULL) { - node_allocr(node) = node_allocr; + if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + // skip cpu + cur_allocr = NULL; + } else { + cur_allocr = node_allocr; + } + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.cur"); } } } - //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); - // pass 3: assign backends to remaining src from dst (should only be leafs) + // pass 2.2 expand gpu down + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + // skip cpu + cur_allocr = NULL; + } else { + cur_allocr = node_allocr; + } + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.cur"); + } + } + } + + // pass 2.3 expand rest up + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + cur_allocr = node_allocr; + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.cur"); + } + } + } +#ifdef DEBUG_PASS2 + fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif + + // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t node_allocr = node_allocr(node); + ggml_tallocr_t cur_allocr = node_allocr(node); + if (ggml_is_view_op(node->op) && cur_allocr == NULL) { + cur_allocr = node_allocr(node) = node_allocr(node->view_src); + SET_CAUSE(node, "3.vsrc"); + } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -977,81 +1091,100 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g } ggml_tallocr_t src_allocr = node_allocr(src); if (src_allocr == NULL) { - node_allocr(src) = node_allocr; + if (src->view_src != NULL) { + // views are always on the same backend as the source + node_allocr(src) = node_allocr(src->view_src); + } else { + node_allocr(src) = cur_allocr; + } } } } - //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS3 + fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif // pass 4: split graph, find tensors that need to be copied - // TODO: - // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost - // find first backend - int cur_split = 0; - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - if (node->view_src == NULL) { - sched->splits[0].tallocr = node_allocr(node); - break; - } - } - sched->splits[0].i_start = 0; - sched->splits[0].n_inputs = 0; - 
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; - size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - - if (ggml_is_view_op(node->op)) { - continue; - } - - ggml_tallocr_t node_allocr = node_allocr(node); - - if (node_allocr != cur_allocr) { - sched->splits[cur_split].i_end = i; - cur_split++; - GGML_ASSERT(cur_split < GGML_MAX_SPLITS); - sched->splits[cur_split].tallocr = node_allocr; - sched->splits[cur_split].i_start = i; - sched->splits[cur_split].n_inputs = 0; - memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK - cur_allocr = node_allocr; - cur_backend_id = sched_allocr_prio(sched, cur_allocr); - } - - // find inputs that are not on the same backend - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { + { + int cur_split = 0; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (node->view_src == NULL) { + sched->splits[0].tallocr = node_allocr(node); break; } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != node_allocr) { - int n_inputs = sched->splits[cur_split].n_inputs++; - GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src; + } + sched->splits[0].i_start = 0; + sched->splits[0].n_inputs = 0; + memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK + ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; + size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; - // create copies - size_t id = hash_id(src); - if (sched->node_copies[id][cur_backend_id] == NULL) { - struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); - sched->node_copies[id][cur_backend_id] = tensor_copy; - node_allocr(tensor_copy) = cur_allocr; - ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); - ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); + if (ggml_is_view_op(node->op)) { + continue; + } + + ggml_tallocr_t node_allocr = node_allocr(node); + + if (node_allocr != cur_allocr) { + sched->splits[cur_split].i_end = i; + cur_split++; + GGML_ASSERT(cur_split < GGML_MAX_SPLITS); + sched->splits[cur_split].tallocr = node_allocr; + sched->splits[cur_split].i_start = i; + sched->splits[cur_split].n_inputs = 0; + memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK + cur_allocr = node_allocr; + cur_backend_id = sched_allocr_prio(sched, cur_allocr); + } + + // find inputs that are not on the same backend + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + ggml_tallocr_t src_allocr = node_allocr(src); + if (src_allocr != node_allocr) { + // check if the input is already in the split + bool found = false; + for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { + if (sched->splits[cur_split].inputs[k] == src) { + found = true; + break; + } + } + + if (!found) { + int n_inputs = sched->splits[cur_split].n_inputs++; + //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr))); + GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); + 
sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src; + } + + // create a copy of the input in the split's backend + size_t id = hash_id(src); + if (sched->node_copies[id][cur_backend_id] == NULL) { + struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); + sched->node_copies[id][cur_backend_id] = tensor_copy; + node_allocr(tensor_copy) = cur_allocr; + ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); + } + node->src[j] = sched->node_copies[id][cur_backend_id]; } - node->src[j] = sched->node_copies[id][cur_backend_id]; } } + sched->splits[cur_split].i_end = graph->n_nodes; + sched->n_splits = cur_split + 1; } - sched->splits[cur_split].i_end = graph->n_nodes; - sched->n_splits = cur_split + 1; +#ifdef DEBUG_PASS4 + fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif - //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout); - -#if 1 +#ifndef NDEBUG // sanity check: all sources should have the same backend as the node for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1059,6 +1192,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g if (node_allocr == NULL) { fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); } + if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) { + fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", + node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", + node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL"); + } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -1070,8 +1208,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL"); } + if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) { + fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", + src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL", + src->view_src->name, node_allocr(src->view_src) ? 
ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL"); + } } } + fflush(stderr); #endif // create copies of the graph for each split @@ -1085,6 +1229,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)]; + // add a dependency to the input source so that it is not freed before the copy is done input_cpy->src[0] = input; graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } @@ -1121,19 +1266,20 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)]; if (input->buffer == NULL) { + GGML_ASSERT(false); if (input->view_src == NULL) { fprintf(stderr, "input %s has no buffer and no view_src\n", input->name); - exit(1); + GGML_ASSERT(false); } // FIXME: may need to use the sched buffer instead ggml_backend_view_init(input->view_src->buffer, input); } if (input_cpy->buffer == NULL) { fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name); - exit(1); + GGML_ASSERT(false); } - //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend); - //GGML_ASSERT(input_cpy->buffer->backend == split_backend); + // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change + // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times ggml_backend_tensor_copy(input, input_cpy); } // ggml_backend_synchronize(split_backend); @@ -1168,13 +1314,23 @@ static void sched_reset(ggml_backend_sched_t sched) { for (int i = 0; i < sched->n_backends; i++) { ggml_tallocr_reset(sched->tallocs[i]); } + // reset state for the next run + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); + memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); + memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); } -ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) { +ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) { + GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS); - struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched)); - memset(sched, 0, sizeof(struct ggml_backend_sched)); + struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); + + // initialize hash table + sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1); + sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1); sched->n_backends = n_backends; for (int i = 0; i < n_backends; i++) { @@ -1199,6 +1355,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { ggml_tallocr_free(sched->tallocs[i]); } ggml_gallocr_free(sched->galloc); + ggml_free(sched->ctx); free(sched->hash_set.keys); free(sched->node_talloc); free(sched->node_copies); @@ -1206,12 +1363,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { } void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { - // initialize hash tables - 
size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS; - sched->hash_set.size = hash_size; - sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size); - sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size); - sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size); + GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once sched_split_graph(sched, measure_graph); sched_alloc_splits(sched); @@ -1227,7 +1379,7 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr } void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); sched_split_graph(sched, graph); sched_alloc_splits(sched); @@ -1235,13 +1387,19 @@ void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cg sched_reset(sched); } +int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { + return sched->n_splits; +} + ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = sched_backend_prio(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return sched->tallocs[backend_index]; } ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = sched_backend_prio(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return ggml_tallocr_get_buffer(sched->tallocs[backend_index]); } @@ -1252,9 +1410,10 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml } // utils + void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); - //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized + //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); @@ -1320,6 +1479,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor struct ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { + graph_init_tensor(hash_set, node_copies, node_init, src->view_src); ggml_backend_view_init(dst->view_src->buffer, dst); } else { @@ -1353,6 +1513,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s struct ggml_context * ctx_allocated = ggml_init(params); struct ggml_context * ctx_unallocated = ggml_init(params); + if (ctx_allocated == NULL || ctx_unallocated == NULL) { + fprintf(stderr, "failed to allocate context for graph copy\n"); + free(hash_set.keys); + free(node_copies); + free(node_init); + ggml_free(ctx_allocated); + ggml_free(ctx_unallocated); + return (struct ggml_backend_graph_copy) { + /* .buffer = */ NULL, + /* .ctx_allocated = */ NULL, + /* .ctx_unallocated = */ NULL, + /* .graph = */ NULL, + }; + } + // dup nodes for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1361,6 +1536,20 @@ struct ggml_backend_graph_copy 
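With ggml_backend_sched_new now taking a graph_size and ggml_backend_sched_get_n_splits being new, the caller-side flow looks roughly like the sketch below. It is a usage sketch only: build of the measure graph and the backend array are assumed to exist elsewhere, GGML_DEFAULT_GRAPH_SIZE is the usual graph-size macro from ggml.h, and error handling is omitted.

#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>

static void run_scheduler_example(ggml_backend_t * backends, int n_backends,
                                  struct ggml_cgraph * measure_graph,
                                  struct ggml_cgraph * graph) {
    // graph_size sizes the internal hash tables (plus room for split inputs)
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, n_backends, GGML_DEFAULT_GRAPH_SIZE);

    // one-time measurement pass to size the per-backend allocators
    ggml_backend_sched_init_measure(sched, measure_graph);

    // per evaluation: split, allocate and compute the graph across the backends
    ggml_backend_sched_graph_compute(sched, graph);
    printf("graph was computed in %d split(s)\n", ggml_backend_sched_get_n_splits(sched));

    ggml_backend_sched_free(sched);
}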
ggml_backend_graph_copy(ggml_backend_t backend, s // allocate nodes ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); + if (buffer == NULL) { + fprintf(stderr, "failed to allocate buffer for graph copy\n"); + free(hash_set.keys); + free(node_copies); + free(node_init); + ggml_free(ctx_allocated); + ggml_free(ctx_unallocated); + return (struct ggml_backend_graph_copy) { + /* .buffer = */ NULL, + /* .ctx_allocated = */ NULL, + /* .ctx_unallocated = */ NULL, + /* .graph = */ NULL, + }; + } //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024); @@ -1397,8 +1586,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { ggml_free(copy.ctx_unallocated); } -void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { +bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); + if (copy.buffer == NULL) { + return false; + } + struct ggml_cgraph * g1 = graph; struct ggml_cgraph * g2 = copy.graph; @@ -1428,4 +1621,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t } ggml_backend_graph_copy_free(copy); + + return true; } diff --git a/ggml-backend.h b/ggml-backend.h index 85ff67b0e..c4eff546a 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -17,22 +17,32 @@ extern "C" { // // buffer type - GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); - GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); - GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer - GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); - GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); - GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer); + enum ggml_backend_buffer_usage { 
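Since ggml_backend_compare_graph_backend now returns bool (false when the graph copy could not be allocated), callers should check the result. A hedged usage sketch; the callback simply keeps the comparison running and the backends/graph are assumed to come from elsewhere:

#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>

// matches ggml_backend_eval_callback: called for each pair of corresponding nodes
static bool eval_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
    (void) node_index; (void) t1; (void) t2; (void) user_data;
    return true; // returning false stops the comparison early
}

static void compare_example(ggml_backend_t b1, ggml_backend_t b2, struct ggml_cgraph * graph) {
    if (!ggml_backend_compare_graph_backend(b1, b2, graph, eval_cb, /*user_data=*/NULL)) {
        fprintf(stderr, "comparison could not run (graph copy allocation failed)\n");
    }
}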
+ GGML_BACKEND_BUFFER_USAGE_ANY = 0, + GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, + }; + + GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); + // // Backend @@ -140,24 +150,23 @@ extern "C" { typedef struct ggml_backend_sched * ggml_backend_sched_t; // Initialize a backend scheduler - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends); - - GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); - + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + // Get the number of splits of the last graph + GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend); - GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); - // Allocate a graph on the backend scheduler + // Allocate and compute graph on the backend scheduler GGML_API void ggml_backend_sched_graph_compute( ggml_backend_sched_t sched, struct ggml_cgraph * graph); - // // Utils // @@ -176,7 +185,7 @@ extern "C" { typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends - GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); + GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); // Tensor initialization GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e26260a35..b5a4a7349 100644 
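The new ggml_backend_buffer_usage enum is a hint attached to a buffer after allocation. A minimal sketch of how a caller might tag a weights buffer, assuming buft and size are provided by the caller:

#include "ggml-backend.h"
#include <cstddef>

static ggml_backend_buffer_t alloc_weights_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
    if (buf != NULL) {
        // hint that this buffer holds model weights rather than transient compute data
        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
    }
    return buf;
}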
--- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8,8 +8,13 @@ #include #include #include +#include #include - +#include +#include +#include "ggml-cuda.h" +#include "ggml.h" +#include "ggml-backend-impl.h" #if defined(GGML_USE_HIPBLAS) #include @@ -77,6 +82,7 @@ #define cudaMemcpyKind hipMemcpyKind #define cudaMemset hipMemset #define cudaMemsetAsync hipMemsetAsync +#define cudaMemGetInfo hipMemGetInfo #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags @@ -112,10 +118,6 @@ #endif // defined(GGML_USE_HIPBLAS) -#include "ggml-cuda.h" -#include "ggml.h" -#include "ggml-backend-impl.h" - #define CC_PASCAL 600 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define CC_VOLTA 700 @@ -553,7 +555,7 @@ static void ggml_cuda_set_device(const int device) { static int g_device_count = -1; static int g_main_device = 0; -static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static std::array g_default_tensor_split = {}; struct cuda_device_capabilities { int cc; // compute capability @@ -564,10 +566,6 @@ struct cuda_device_capabilities { static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} }; -static void * g_scratch_buffer = nullptr; -static size_t g_scratch_size = 0; // disabled by default -static size_t g_scratch_offset = 0; - static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; [[noreturn]] @@ -7329,8 +7327,9 @@ void ggml_init_cublas() { CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); - g_tensor_split[id] = total_vram; + g_default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; + #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; #else @@ -7339,7 +7338,7 @@ void ggml_init_cublas() { g_device_caps[id].smpb = prop.sharedMemPerBlock; } for (int id = 0; id < g_device_count; ++id) { - g_tensor_split[id] /= total_vram; + g_default_tensor_split[id] /= total_vram; } for (int id = 0; id < g_device_count; ++id) { @@ -7363,30 +7362,6 @@ void ggml_init_cublas() { } } -void ggml_cuda_set_tensor_split(const float * tensor_split) { - if (tensor_split == nullptr) { - return; - } - bool all_zero = true; - for (int i = 0; i < g_device_count; ++i) { - if (tensor_split[i] != 0.0f) { - all_zero = false; - break; - } - } - if (all_zero) { - return; - } - float split_sum = 0.0f; - for (int i = 0; i < g_device_count; ++i) { - g_tensor_split[i] = split_sum; - split_sum += tensor_split[i]; - } - for (int i = 0; i < g_device_count; ++i) { - g_tensor_split[i] /= split_sum; - } -} - void * ggml_cuda_host_malloc(size_t size) { if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { return nullptr; @@ -7838,11 +7813,11 @@ static void ggml_cuda_op_mul_mat_q( (void) src1_ddf_i; } -static int64_t get_row_rounding(ggml_type type) { +static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { int64_t min_compute_capability = INT_MAX; int64_t max_compute_capability = INT_MIN; for (int id = 0; id < g_device_count; ++id) { - if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (tensor_split[id] < (id + 1 < g_device_count ? 
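The default tensor split built in ggml_init_cublas above is a prefix sum of per-device VRAM, normalized to [0, 1): each entry is the cumulative fraction at which that device's row range starts. A standalone worked example with hypothetical VRAM sizes:

#include <array>
#include <cstdio>

int main() {
    const std::array<double, 2> vram = { 24e9, 8e9 }; // e.g. a 24 GB GPU and an 8 GB GPU
    std::array<double, 2> split{};
    double total = 0.0;
    for (size_t id = 0; id < vram.size(); id++) {
        split[id] = total;   // prefix sum of the VRAM seen so far
        total += vram[id];
    }
    for (size_t id = 0; id < vram.size(); id++) {
        split[id] /= total;  // normalize so the boundaries lie in [0, 1)
    }
    printf("default split = { %.2f, %.2f }\n", split[0], split[1]); // { 0.00, 0.75 }
    return 0;
}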
tensor_split[id + 1] : 1.0f)) { if (min_compute_capability > g_device_caps[id].cc) { min_compute_capability = g_device_caps[id].cc; } @@ -7901,6 +7876,21 @@ static int64_t get_row_rounding(ggml_type type) { #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } +static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { + const int64_t nrows = ggml_nrows(tensor); + const int64_t rounding = get_row_rounding(tensor->type, tensor_split); + + *row_low = id == 0 ? 0 : nrows*tensor_split[id]; + *row_low -= *row_low % rounding; + + if (id == g_device_count - 1) { + *row_high = nrows; + } else { + *row_high = nrows*tensor_split[id + 1]; + *row_high -= *row_high % rounding; + } +} + static void ggml_cuda_op_mul_mat_vec_q( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, @@ -8515,6 +8505,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { peer_access_enabled = enable_peer_access; } +// FIXME: move this somewhere else +struct ggml_backend_cuda_split_buffer_type_context { + std::array tensor_split; +}; + static void ggml_cuda_op_mul_mat( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, const bool convert_src1_to_q8_1) { @@ -8566,6 +8561,14 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); + std::array tensor_split; + if (split) { + // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check + // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...); + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; + tensor_split = buft_ctx->tensor_split; + } + struct dev_data { cuda_pool_alloc src0_dd_alloc; cuda_pool_alloc src1_ddf_alloc; @@ -8593,17 +8596,17 @@ static void ggml_cuda_op_mul_mat( // for multi GPU, get the row boundaries from tensor split // and round to mul_mat_q tile sizes if (split) { - const int64_t rounding = get_row_rounding(src0->type); + const int64_t rounding = get_row_rounding(src0->type, tensor_split); if (id != 0) { - dev[id].row_low = ne01*g_tensor_split[id]; + dev[id].row_low = ne01*tensor_split[id]; if (dev[id].row_low < ne01) { dev[id].row_low -= dev[id].row_low % rounding; } } if (id != g_device_count - 1) { - dev[id].row_high = ne01*g_tensor_split[id + 1]; + dev[id].row_high = ne01*tensor_split[id + 1]; if (dev[id].row_high < ne01) { dev[id].row_high -= dev[id].row_high % rounding; } @@ -9149,10 +9152,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; - for (int id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? 
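get_row_split above turns the cumulative split fractions into per-device row ranges, rounding the boundaries down to the row-rounding granularity. A hedged, standalone version of that arithmetic with a fixed rounding value instead of get_row_rounding():

#include <cstdint>
#include <cstdio>

static void row_split(int64_t nrows, const float * split, int n_dev, int id,
                      int64_t rounding, int64_t * row_low, int64_t * row_high) {
    *row_low   = id == 0 ? 0 : (int64_t)(nrows*split[id]);
    *row_low  -= *row_low % rounding;                              // align the start of the slice
    *row_high  = id == n_dev - 1 ? nrows : (int64_t)(nrows*split[id + 1]);
    if (id != n_dev - 1) {
        *row_high -= *row_high % rounding;                         // align the end of the slice
    }
    printf("device %d: rows [%lld, %lld)\n", id, (long long)*row_low, (long long)*row_high);
}

int main() {
    const float split[2] = { 0.00f, 0.75f };   // e.g. the default split from the example above
    int64_t lo, hi;
    for (int id = 0; id < 2; id++) {
        row_split(/*nrows=*/4096, split, /*n_dev=*/2, id, /*rounding=*/32, &lo, &hi);
    }
    return 0;
}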
g_tensor_split[id + 1] : 1.0f)) { - min_compute_capability = g_device_caps[id].cc; + + if (split) { + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; + auto & tensor_split = buft_ctx->tensor_split; + for (int id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_device_caps[id].cc; + } } + } else { + min_compute_capability = g_device_caps[g_main_device].cc; } #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) @@ -9191,7 +9201,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { @@ -9653,247 +9663,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { - const int64_t nrows = ggml_nrows(tensor); - - const int64_t ne0 = tensor->ne[0]; - - const size_t nb1 = tensor->nb[1]; - - ggml_backend_type backend = tensor->backend; - ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); - - for (int id = 0; id < g_device_count; ++id) { - if (backend == GGML_BACKEND_GPU && id != g_main_device) { - continue; - } - - ggml_cuda_set_device(id); - - int64_t row_low, row_high; - if (backend == GGML_BACKEND_GPU) { - row_low = 0; - row_high = nrows; - } else if (backend == GGML_BACKEND_GPU_SPLIT) { - const int64_t rounding = get_row_rounding(tensor->type); - - row_low = id == 0 ? 
0 : nrows*g_tensor_split[id]; - row_low -= row_low % rounding; - - if (id == g_device_count - 1) { - row_high = nrows; - } else { - row_high = nrows*g_tensor_split[id + 1]; - row_high -= row_high % rounding; - } - } else { - GGML_ASSERT(false); - } - if (row_low == row_high) { - continue; - } - - int64_t nrows_split = row_high - row_low; - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - char * buf; - CUDA_CHECK(cudaMalloc(&buf, size)); - char * buf_host = (char *)data + offset_split; - - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); - } - - CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); - - extra->data_device[id] = buf; - - if (backend == GGML_BACKEND_GPU_SPLIT) { - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); - } - } - } - - tensor->extra = extra; -} - -void ggml_cuda_free_data(struct ggml_tensor * tensor) { - if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { - return; - } - - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - - for (int id = 0; id < g_device_count; ++id) { - ggml_cuda_set_device(id); - if (extra->data_device[id] != nullptr) { - CUDA_CHECK(cudaFree(extra->data_device[id])); - } - - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - if (extra->events[id][is] != nullptr) { - CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); - } - } - } - - delete extra; -} - -static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; -static size_t g_temp_tensor_extra_index = 0; - -static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { - if (g_temp_tensor_extras == nullptr) { - g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; - } - - size_t alloc_index = g_temp_tensor_extra_index; - g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; - ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; - memset(extra, 0, sizeof(*extra)); - - return extra; -} - -static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { - if (scratch && g_scratch_size == 0) { - return; - } - - tensor->backend = GGML_BACKEND_GPU; - - // recursively assign CUDA buffers until a compute tensor is found - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { - const ggml_op src0_op = tensor->src[0]->op; - if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { - ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); - } - } - if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { - ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); - } - - if (scratch && no_alloc) { - return; - } - - ggml_tensor_extra_gpu * extra; - - const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || - tensor->op == GGML_OP_VIEW || - force_inplace; - const size_t size = 
ggml_nbytes(tensor); - - ggml_cuda_set_device(g_main_device); - if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; - size_t offset = 0; - if (tensor->op == GGML_OP_VIEW) { - memcpy(&offset, tensor->op_params, sizeof(size_t)); - } - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = src0_ddc + offset; - } else if (tensor->op == GGML_OP_CPY) { - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; - void * src1_ddv = src1_extra->data_device[g_main_device]; - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = src1_ddv; - } else if (scratch) { - GGML_ASSERT(size <= g_scratch_size); - if (g_scratch_offset + size > g_scratch_size) { - g_scratch_offset = 0; - } - - char * data = (char *) g_scratch_buffer; - if (data == nullptr) { - CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); - g_scratch_buffer = data; - } - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = data + g_scratch_offset; - - g_scratch_offset += size; - - GGML_ASSERT(g_scratch_offset <= g_scratch_size); - } else { // allocate new buffers outside of scratch - void * data; - CUDA_CHECK(cudaMalloc(&data, size)); - CUDA_CHECK(cudaMemset(data, 0, size)); - extra = new ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); - extra->data_device[g_main_device] = data; - } - - tensor->extra = extra; -} - -void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) { - if (g_scratch_size == 0) { - return; - } - if (g_scratch_buffer == nullptr) { - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); - } - - ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); - - const bool inplace = tensor->view_src != nullptr; - - if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; - size_t view_offset = 0; - if (tensor->op == GGML_OP_VIEW) { - memcpy(&view_offset, tensor->op_params, sizeof(size_t)); - } - extra->data_device[g_main_device] = src0_ddc + view_offset; - } else { - extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset; - } - - tensor->extra = extra; -} - -void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - GGML_ASSERT(ggml_is_contiguous(tensor)); - - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice)); -} - -void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, true, false, false); -} - -void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, true, false, true); -} - -void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false, false, false); -} - -void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false, true, 
false); -} - -void ggml_cuda_set_main_device(const int main_device) { +static void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -9902,30 +9672,12 @@ void ggml_cuda_set_main_device(const int main_device) { if (g_main_device != main_device && g_device_count > 1) { g_main_device = main_device; - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); - fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); + //cudaDeviceProp prop; + //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); + //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); } } -void ggml_cuda_set_scratch_size(const size_t scratch_size) { - // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously - // it still won't always work as expected, but it's better than nothing - if (scratch_size > g_scratch_size) { - ggml_cuda_free_scratch(); - } - g_scratch_size = std::max(g_scratch_size, scratch_size); -} - -void ggml_cuda_free_scratch() { - if (g_scratch_buffer == nullptr) { - return; - } - - CUDA_CHECK(cudaFree(g_scratch_buffer)); - g_scratch_buffer = nullptr; -} - bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { if (!g_cublas_loaded) return false; @@ -10104,6 +9856,11 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des #define UNUSED GGML_UNUSED +struct ggml_backend_context_cuda { + int device; + std::string name; +}; + // cuda buffer struct ggml_backend_buffer_context_cuda { @@ -10111,8 +9868,12 @@ struct ggml_backend_buffer_context_cuda { void * dev_ptr = nullptr; ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; size_t temp_tensor_extra_index = 0; + std::string name; - ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {} + ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : + device(device), dev_ptr(dev_ptr), + name(GGML_CUDA_NAME + std::to_string(device)) { + } ~ggml_backend_buffer_context_cuda() { delete[] temp_tensor_extras; @@ -10132,6 +9893,11 @@ struct ggml_backend_buffer_context_cuda { } }; +static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + return ctx->name.c_str(); +} + static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; CUDA_CHECK(cudaFree(ctx->dev_ptr)); @@ -10173,8 +9939,6 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0])); } } - - UNUSED(buffer); } static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -10184,8 +9948,8 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); + 
CUDA_CHECK(cudaDeviceSynchronize()); } static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -10195,7 +9959,6 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); } @@ -10204,11 +9967,12 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); + CUDA_CHECK(cudaDeviceSynchronize()); } -static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { +static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, @@ -10217,23 +9981,39 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { /* .cpy_tensor_from = */ NULL, /* .cpy_tensor_to = */ NULL, /* .clear = */ ggml_backend_cuda_buffer_clear, + /* .reset = */ NULL, }; // cuda buffer type -static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - int device = (int) (intptr_t) buft->context; +struct ggml_backend_cuda_buffer_type_context { + int device; + std::string name; +}; - ggml_cuda_set_device(device); +static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { + ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + + return ctx->name.c_str(); +} + +static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + + ggml_cuda_set_device(buft_ctx->device); size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0 void * dev_ptr; - CUDA_CHECK(cudaMalloc(&dev_ptr, size)); + cudaError_t err = cudaMalloc(&dev_ptr, size); + if (err != cudaSuccess) { + fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err)); + return nullptr; + } - ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr); + ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(buft_ctx->device, dev_ptr); - return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -10242,7 +10022,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_ty UNUSED(buft); } -static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { +static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { int64_t row_low = 0; int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; @@ -10263,21 +10043,32 @@ static size_t 
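Host code normally does not call these buffer set/get hooks directly; it goes through the generic ggml_backend_tensor_set/get helpers, which dispatch to the functions above. A hedged sketch, assuming t is an F32 tensor that has already been allocated in a backend buffer:

#include "ggml.h"
#include "ggml-backend.h"
#include <vector>

static void upload_and_download(struct ggml_tensor * t) {
    std::vector<float> host(ggml_nelements(t), 1.0f);
    ggml_backend_tensor_set(t, host.data(), 0, ggml_nbytes(t)); // host -> device
    ggml_backend_tensor_get(t, host.data(), 0, ggml_nbytes(t)); // device -> host
}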
ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t } static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cuda(backend); + if (!ggml_backend_is_cuda(backend)) { + return false; + } - UNUSED(buft); + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + + return buft_ctx->device == cuda_ctx->device; } static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, - /* .is_host = */ nullptr, + /* .is_host = */ NULL, }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { - static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + // FIXME: this is not thread safe + if (device >= ggml_backend_cuda_get_device_count()) { + return nullptr; + } + + static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; static bool ggml_backend_cuda_buffer_type_initialized = false; @@ -10285,7 +10076,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { ggml_backend_cuda_buffer_types[i] = { /* .iface = */ ggml_backend_cuda_buffer_type_interface, - /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, + /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)}, }; } ggml_backend_cuda_buffer_type_initialized = true; @@ -10294,8 +10085,298 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { return &ggml_backend_cuda_buffer_types[device]; } +// cuda split buffer + +struct ggml_backend_cuda_split_buffer_context { + ~ggml_backend_cuda_split_buffer_context() { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int id = 0; id < g_device_count; ++id) { + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + CUDA_CHECK(cudaFree(extra->data_device[id])); + } + delete extra; + } + } + + std::vector tensor_extras; +}; + +static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { + return GGML_CUDA_NAME "_Split"; + + UNUSED(buffer); +} + +static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + delete ctx; +} + +static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { + // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced + return (void *)0x1000; + + UNUSED(buffer); +} + +static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported + + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; 
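With the changes above, ggml_backend_cuda_buffer_type() returns NULL for an out-of-range device and buffer allocation returns NULL instead of aborting when cudaMalloc fails, so both results should be checked. A usage sketch:

#include "ggml-backend.h"
#include "ggml-cuda.h"
#include <cstdio>

static ggml_backend_buffer_t alloc_on_device(int device, size_t size) {
    ggml_backend_buffer_type_t buft = ggml_backend_cuda_buffer_type(device);
    if (buft == NULL) {
        fprintf(stderr, "invalid CUDA device %d\n", device);
        return NULL;
    }
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, size);
    if (buf == NULL) {
        fprintf(stderr, "allocating %zu bytes on device %d failed\n", size, device);
    }
    return buf;
}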
+ + const int64_t ne0 = tensor->ne[0]; + + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + + ctx->tensor_extras.push_back(extra); + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + // FIXME: do not crash if cudaMalloc fails + // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + ggml_cuda_set_device(id); + char * buf; + CUDA_CHECK(cudaMalloc(&buf, size)); + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + + extra->data_device[id] = buf; + + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + } + } + tensor->backend = GGML_BACKEND_GPU_SPLIT; + tensor->extra = extra; +} + +static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + const char * buf_host = (const char *)data + offset_split; + CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice)); + } +} + +static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = 
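The padding applied when allocating split tensors above rounds the last row up so that ne0 becomes a multiple of MATRIX_ROW_PADDING (512 elements, per the comments). A standalone illustration of that arithmetic with hypothetical sizes:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t MATRIX_ROW_PADDING = 512;  // padding granularity used by the CUDA backend
    const int64_t ne0       = 4097;          // hypothetical row length in elements
    const int64_t type_size = 2;             // e.g. 2 bytes per element (f16), for illustration only

    int64_t padded_elems = ne0;
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        padded_elems += MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING;
    }
    printf("row of %lld elems is padded to %lld elems (+%lld bytes)\n",
           (long long)ne0, (long long)padded_elems, (long long)((padded_elems - ne0)*type_size));
    return 0;
}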
ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + char * buf_host = (char *)data + offset_split; + CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost)); + } +} + +static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + UNUSED(buffer); + UNUSED(value); +} + +static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_get_name, + /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, + /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, + /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor, + /* .cpy_tensor_from = */ NULL, + /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_cuda_split_buffer_clear, + /* .reset = */ NULL, +}; + +// cuda split buffer type + +static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_CUDA_NAME "_Split"; + + UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point + // instead, we allocate them for each tensor separately in init_tensor + // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, + // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. 
+ ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); + + return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); +} + +static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + + UNUSED(buft); +} + +static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; + + size_t total_size = 0; + + const int64_t ne0 = tensor->ne[0]; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + total_size += ggml_nbytes_split(tensor, nrows_split); + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return total_size; +} + +static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return ggml_backend_is_cuda(backend); + + UNUSED(buft); +} + +static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, + /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, + /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, +}; + +ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { + // FIXME: this is not thread safe + static std::map, struct ggml_backend_buffer_type> buft_map; + + std::array tensor_split_arr = {}; + + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; }); + if (all_zero) { + tensor_split_arr = g_default_tensor_split; + } else { + float split_sum = 0.0f; + for (int i = 0; i < g_device_count; ++i) { + tensor_split_arr[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < g_device_count; ++i) { + tensor_split_arr[i] /= split_sum; + } + } + + auto it = buft_map.find(tensor_split_arr); + if (it != buft_map.end()) { + return &it->second; + } + + struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, + /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, + }; + + auto result = buft_map.emplace(tensor_split_arr, buft); + return &result.first->second; +} + // host buffer type +static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_CUDA_NAME "_Host"; + + UNUSED(buft); +} + +static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { + return GGML_CUDA_NAME "_Host"; + + UNUSED(buffer); +} + static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { 
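ggml_backend_cuda_split_buffer_type() takes per-device proportions (not cumulative boundaries); they are normalized into boundaries internally, and a NULL or all-zero array falls back to the VRAM-based default split. A minimal usage sketch, with the 3:1 ratio chosen purely for illustration:

#include "ggml-cuda.h"

static ggml_backend_buffer_type_t make_split_buft(void) {
    // put ~75% of the rows on device 0 and ~25% on device 1
    const float tensor_split[GGML_CUDA_MAX_DEVICES] = { 3.0f, 1.0f };
    return ggml_backend_cuda_split_buffer_type(tensor_split);
}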
ggml_cuda_host_free(buffer->context); } @@ -10308,9 +10389,9 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); } - // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cuda_host_buffer_name; buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; return buffer; @@ -10319,6 +10400,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { /* .iface = */ { + /* .get_name = */ ggml_backend_cuda_host_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, @@ -10333,14 +10415,10 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { // backend -struct ggml_backend_context_cuda { - int device; -}; - static const char * ggml_backend_cuda_name(ggml_backend_t backend) { - return GGML_CUDA_NAME; + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; - UNUSED(backend); + return cuda_ctx->name.c_str(); } static void ggml_backend_cuda_free(ggml_backend_t backend) { @@ -10382,29 +10460,6 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { - GGML_ASSERT(!"not implemented"); - - return nullptr; - - UNUSED(backend); - UNUSED(cgraph); -} - -static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} - -static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} - static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; @@ -10419,46 +10474,25 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) continue; - assert(node->backend == GGML_BACKEND_GPU); +#ifndef NDEBUG + assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT); assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_GPU); - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT); + //assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->src[j]->extra != nullptr); } } +#endif bool ok = ggml_cuda_compute_forward(¶ms, node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } 
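The host buffer type above hands out pinned memory for faster host/device transfers, falling back to a plain CPU buffer when pinned allocation fails (or when GGML_CUDA_NO_PINNED is set, per ggml_cuda_host_malloc). A short usage sketch:

#include "ggml-backend.h"
#include "ggml-cuda.h"
#include <cstddef>

static ggml_backend_buffer_t alloc_pinned_staging(size_t size) {
    // the returned buffer behaves like a CPU buffer, but its memory is page-locked when possible
    return ggml_backend_buft_alloc_buffer(ggml_backend_cuda_host_buffer_type(), size);
}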
GGML_ASSERT(ok); - -#if 0 - if (node->type == GGML_TYPE_F32) { - cudaDeviceSynchronize(); - std::vector tmp(ggml_nelements(node), 0.0f); - cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); - printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op), - ggml_type_name(node->src[0]->type), - node->src[1] ? ggml_type_name(node->src[1]->type) : "none", - node->src[0]->name, - node->src[1] ? node->src[1]->name : "none"); - double sum = 0.0; - double sq_sum = 0.0; - for (int i = 0; i < ggml_nelements(node); i++) { - printf("%f ", tmp[i]); - sum += tmp[i]; - sq_sum += tmp[i]*tmp[i]; - } - printf("\n"); - printf("sum: %f, ", sum); - printf("sq_sum: %f\n", sq_sum); - } -#endif } UNUSED(backend); @@ -10577,7 +10611,7 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten UNUSED(backend); } -static ggml_backend_i cuda_backend_i = { +static ggml_backend_i ggml_backend_cuda_interface = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, @@ -10586,9 +10620,9 @@ static ggml_backend_i cuda_backend_i = { /* .cpy_tensor_from_async = */ NULL, /* .cpy_tensor_to_async = */ NULL, /* .synchronize = */ ggml_backend_cuda_synchronize, - /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, - /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, - /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, }; @@ -10605,11 +10639,12 @@ ggml_backend_t ggml_backend_cuda_init(int device) { ggml_cuda_set_main_device(device); ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda { - /* .device = */ device + /* .device = */ device, + /* .name = */ GGML_CUDA_NAME + std::to_string(device), }; ggml_backend_t cuda_backend = new ggml_backend { - /* .interface = */ cuda_backend_i, + /* .interface = */ ggml_backend_cuda_interface, /* .context = */ ctx }; @@ -10617,9 +10652,24 @@ ggml_backend_t ggml_backend_cuda_init(int device) { } bool ggml_backend_is_cuda(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_cuda_name; + return backend && backend->iface.get_name == ggml_backend_cuda_name; } +int ggml_backend_cuda_get_device_count() { + return ggml_cuda_get_device_count(); +} + +void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { + ggml_cuda_get_device_description(device, description, description_size); +} + +void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { + ggml_cuda_set_device(device); + + CUDA_CHECK(cudaMemGetInfo(free, total)); +} + +// backend registry static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); return cuda_backend; diff --git a/ggml-cuda.h b/ggml-cuda.h index cdb0c0c41..d19cbf3fd 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -27,22 +27,6 @@ GGML_API void * ggml_cuda_host_malloc(size_t size); GGML_API void ggml_cuda_host_free(void * ptr); GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split); -GGML_API void 
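The new backend-level device queries added above (count, description, free/total memory) can be used to enumerate GPUs without touching the CUDA runtime directly. A usage sketch:

#include "ggml-cuda.h"
#include <cstdio>

int main() {
    const int n = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n; i++) {
        char desc[128];
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        printf("device %d: %s, %.1f GiB free / %.1f GiB total\n", i, desc,
               free_mem/1024.0/1024.0/1024.0, total_mem/1024.0/1024.0/1024.0);
    }
    return 0;
}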
ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); -GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); -GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_set_main_device(int main_device); -GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); -GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size); -GGML_API void ggml_cuda_free_scratch(void); GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); GGML_API int ggml_cuda_get_device_count(void); @@ -52,13 +36,17 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description, GGML_API ggml_backend_t ggml_backend_cuda_init(int device); GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); -GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); - -// pinned host buffer for use with CPU backend for faster copies between CPU and GPU +// split tensor buffer that splits matrices by rows across multiple devices +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); +GGML_API int ggml_backend_cuda_get_device_count(void); +GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); + #ifdef __cplusplus } #endif diff --git a/ggml-impl.h b/ggml-impl.h index 2faced080..2c58075ac 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_HASHTABLE_FULL ((size_t)-1) #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2) +struct ggml_hash_set ggml_hash_set_new(size_t size); + bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key); // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted diff --git a/ggml-metal.m b/ggml-metal.m index 6c2a8d04e..547b618b4 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2482,10 +2482,10 @@ static void ggml_backend_metal_free_device(void) { } } -static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { - struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; +static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { + return "Metal"; - return ctx->all_data; + UNUSED(buffer); } static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -2503,6 +2503,12 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) free(ctx); } +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context 
*)buffer->context; + + return ctx->all_data; +} + static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); @@ -2515,13 +2521,13 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c UNUSED(buffer); } -static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); UNUSED(buffer); } -static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); UNUSED(buffer); @@ -2534,6 +2540,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ } static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { + /* .get_name = */ ggml_backend_metal_buffer_get_name, /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .get_base = */ ggml_backend_metal_buffer_get_base, /* .init_tensor = */ NULL, @@ -2542,10 +2549,17 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, /* .clear = */ ggml_backend_metal_buffer_clear, + /* .reset = */ NULL, }; // default buffer type +static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "Metal"; + + UNUSED(buft); +} + static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); @@ -2618,6 +2632,7 @@ static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t bu ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { + /* .get_name = */ ggml_backend_metal_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -2641,6 +2656,14 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz ctx->n_buffers = 0; const size_t size_page = sysconf(_SC_PAGESIZE); + + // page-align the data ptr + { + const uintptr_t offs = (uintptr_t) data % size_page; + data = (void *) ((char *) data - offs); + size += offs; + } + size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); @@ -2741,7 +2764,7 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct UNUSED(backend); } -static struct ggml_backend_i metal_backend_i = { +static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type, @@ -2767,7 +2790,7 @@ ggml_backend_t 
ggml_backend_metal_init(void) { ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); *metal_backend = (struct ggml_backend) { - /* .interface = */ metal_backend_i, + /* .interface = */ ggml_backend_metal_i, /* .context = */ ctx, }; @@ -2775,7 +2798,7 @@ ggml_backend_t ggml_backend_metal_init(void) { } bool ggml_backend_is_metal(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_metal_name; + return backend && backend->iface.get_name == ggml_backend_metal_name; } void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 496f9cdca..cfa766eb1 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "ggml-opencl.h" +#include "ggml-backend-impl.h" #include #include @@ -10,7 +11,7 @@ #include #include -#define CL_TARGET_OPENCL_VERSION 110 +#define CL_TARGET_OPENCL_VERSION 120 #include #if defined(_MSC_VER) @@ -929,6 +930,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co } void ggml_cl_init(void) { + static bool initialized = false; + if (initialized) { + return; + } + cl_int err; struct cl_device; @@ -1483,8 +1489,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } else { d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); } - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); size_t x_offset = 0; @@ -1501,7 +1507,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + if (src1->backend == GGML_BACKEND_CPU) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + } CL_CHECK(clFinish(queue)); @@ -1522,8 +1530,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + if (dst->backend == GGML_BACKEND_CPU) { + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + } } } } @@ -1532,8 +1542,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr if (src0->backend != GGML_BACKEND_GPU) { ggml_cl_pool_free(d_X, x_size); } - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); + if (src1->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_Y, y_size); + } + if (dst->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_D, d_size); + } } static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { @@ -1598,6 +1612,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); } + // FIXME: convert on device + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // convert src1 to fp16 // TODO: use multiple threads @@ 
-1643,11 +1659,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host, then convert to float - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - ggml_fp16_to_fp32_row(tmp, d, d_ne); + if (dst->backend == GGML_BACKEND_CPU) { + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + ggml_fp16_to_fp32_row(tmp, d, d_ne); + } else { + // FIXME: convert dst to fp32 on device + } } } } @@ -1801,7 +1819,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) { const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; @@ -1895,3 +1913,292 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { tensor->extra = dst; GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); } + +// ggml-backend + +// buffer + +struct ggml_backend_opencl_buffer_context { + ~ggml_backend_opencl_buffer_context() { + if (buffer) { + clReleaseMemObject(buffer); + } + for (auto * sub_buffer : sub_buffers) { + clReleaseMemObject(sub_buffer); + } + } + + cl_mem buffer; + std::vector sub_buffers; +}; + +static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000; + +static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) { + return "OpenCL"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + delete ctx; +} + +static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { + return cl_ptr_base; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (tensor->view_src != NULL && tensor->view_offs == 0) { + tensor->extra = tensor->view_src->extra; + } else { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)}; + cl_int err; + cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + ctx->sub_buffers.push_back(sub_buffer); + tensor->extra = sub_buffer; + } + tensor->backend = GGML_BACKEND_GPU; +} + +static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + cl_mem tensor_buffer = (cl_mem) tensor->extra; + CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + cl_mem tensor_buffer = (cl_mem) tensor->extra; + CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); + + GGML_UNUSED(buffer); +} + +static void 
ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); +} + +static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + clReleaseMemObject(sub_buffer); + } + ctx->sub_buffers.clear(); +} + +static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = { + /* .get_name = */ ggml_backend_opencl_buffer_get_name, + /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer, + /* .get_base = */ ggml_backend_opencl_buffer_get_base, + /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor, + /* .cpy_tensor_from = */ NULL, + /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_opencl_buffer_clear, + /* .reset = */ ggml_backend_opencl_buffer_reset, +}; + +// buffer type + +static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) { + return "OpenCL"; + + GGML_UNUSED(buffer_type); +} + +static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) { + ggml_cl_init(); + + cl_int err; + cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err); + if (err != CL_SUCCESS) { + fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0); + return nullptr; + } + + ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}}; + + return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size); +} + +static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { + // FIXME: not thread safe, device may not be initialized yet + static cl_uint alignment = -1; + if (alignment == (cl_uint)-1) { + ggml_cl_init(); + clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL); + } + return alignment; + + GGML_UNUSED(buffer_type); +} + +static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) { + //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend + return ggml_backend_is_cpu(backend); + + GGML_UNUSED(buffer_type); +} + +static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { + /* .get_name = */ ggml_backend_opencl_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, + /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend, + /* .is_host = */ NULL, +}; + + +ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { + static ggml_backend_buffer_type buffer_type = { + /* .iface = */ ggml_backend_opencl_buffer_type_interface, + /* .context = */ nullptr, + }; + + return &buffer_type; +} + +#if 0 +// host buffer type + +static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "CL_Host"; + + GGML_UNUSED(buft); +} + +static const char * 
ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) { + return "CL_Host"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_cl_host_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr = ggml_cl_host_malloc(size); + + if (ptr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.get_name = ggml_backend_opencl_host_buffer_name; + buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_opencl_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_opencl_buffer_type_host; +} + +// backend + +static const char * ggml_backend_opencl_name(ggml_backend_t backend) { + return "OpenCL"; + + GGML_UNUSED(backend); +} + +static void ggml_backend_opencl_free(ggml_backend_t backend) { + GGML_UNUSED(backend); +} + +static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_opencl_buffer_type(); + + GGML_UNUSED(backend); +} + +static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { + for (int i = 0; i < graph->n_nodes; ++i) { + ggml_tensor * node = graph->nodes[i]; + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0); + break; + case GGML_OP_MUL: + ggml_cl_mul(node->src[0], node->src[1], node); + break; + default: + GGML_ASSERT(false); + } + } + + return true; + + GGML_UNUSED(backend); +} + +static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_can_mul_mat(op->src[0], op->src[1], op); + case GGML_OP_MUL: + // return ggml_can_repeat_rows(op->src[1], op->src[0]); + return true; + default: + return false; + } + + GGML_UNUSED(backend); +} + +static ggml_backend_i opencl_backend_i = { + /* .get_name = */ ggml_backend_opencl_name, + /* .free = */ ggml_backend_opencl_free, + /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_opencl_graph_compute, + /* .supports_op = */ ggml_backend_opencl_supports_op, +}; + +ggml_backend_t ggml_backend_opencl_init() { + ggml_backend_t backend = new ggml_backend { + /* .interface = */ opencl_backend_i, + /* .context = */ nullptr + }; + + return backend; 
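    // A hedged sketch of how this (still disabled) OpenCL backend and the buffer type above
    // could be exercised once enabled; it relies only on ggml-backend API already used
    // elsewhere in this patch (ggml_backend_buft_alloc_buffer, ggml_backend_buffer_name,
    // ggml_backend_buffer_get_size, ggml_backend_buffer_free), with an illustrative size:
    //
    //   ggml_backend_t cl_backend = ggml_backend_opencl_init();
    //   GGML_ASSERT(ggml_backend_is_opencl(cl_backend));
    //   ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(ggml_backend_opencl_buffer_type(), 16u*1024*1024);
    //   printf("%s buffer, %zu bytes\n", ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)); // "OpenCL"
    //   ggml_backend_buffer_free(buf);
    //   ggml_backend_free(cl_backend);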
+} + +bool ggml_backend_is_opencl(ggml_backend_t backend) { + return backend && backend->iface.get_name == ggml_backend_opencl_name; +} +#endif diff --git a/ggml-opencl.h b/ggml-opencl.h index 44d05bd64..919b00d63 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #ifdef __cplusplus extern "C" { @@ -9,17 +10,26 @@ extern "C" { GGML_API void ggml_cl_init(void); GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst); GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); -GGML_API void * ggml_cl_host_malloc(size_t size); -GGML_API void ggml_cl_host_free(void * ptr); +// GGML_API void * ggml_cl_host_malloc(size_t size); +// GGML_API void ggml_cl_host_free(void * ptr); GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor); GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); +// backend API + +// GGML_API ggml_backend_t ggml_backend_opencl_init(void); + +// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend); + +GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void); +// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void); + #ifdef __cplusplus } #endif diff --git a/ggml.c b/ggml.c index adb387100..1027fabdb 100644 --- a/ggml.c +++ b/ggml.c @@ -2336,6 +2336,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } void ggml_free(struct ggml_context * ctx) { + if (ctx == NULL) { + return; + } + // make this function thread safe ggml_critical_section_start(); @@ -4351,6 +4355,23 @@ struct ggml_tensor * ggml_cpy_inplace( return ggml_cpy_impl(ctx, a, b, true); } +struct ggml_tensor * ggml_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_type type) { + bool is_node = false; + + struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); + ggml_format_name(result, "%s (copy)", a->name); + + result->op = GGML_OP_CPY; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = result; + + return result; +} + // ggml_cont static struct ggml_tensor * ggml_cont_impl( @@ -14851,7 +14872,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso return i; } -static struct ggml_hash_set ggml_hash_set_new(size_t size) { +struct ggml_hash_set ggml_hash_set_new(size_t size) { size = ggml_hash_size(size); struct ggml_hash_set result; result.size = size; @@ -16600,7 +16621,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return GGML_EXIT_SUCCESS; } -struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { +struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } @@ -16662,14 +16683,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_MUL_MAT_ID: { + cur = 0; const struct ggml_tensor * src0 = node->src[2]; const struct ggml_tensor * src1 = node->src[1]; const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; if (src1->type != vec_dot_type) { - cur = ggml_row_size(vec_dot_type, ggml_nelements(src1)); + cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); } const int n_as = ggml_get_op_params_i32(node, 1); - cur = GGML_PAD(cur, sizeof(int64_t)); // align + cur += GGML_PAD(cur, sizeof(int64_t)); // align cur += n_as * sizeof(int64_t); // matrix_row_counts cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows } break; diff --git a/ggml.h b/ggml.h index c55e598b4..2013a73d1 100644 --- a/ggml.h +++ b/ggml.h @@ -1167,6 +1167,11 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_cast( + struct ggml_context * ctx, + struct ggml_tensor * a, + enum ggml_type type); + // make contiguous GGML_API struct ggml_tensor * ggml_cont( struct ggml_context * ctx, @@ -1849,8 +1854,8 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data diff --git a/llama.cpp b/llama.cpp index 618e47cf0..3f2ae956f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,5 +1,4 @@ #define LLAMA_API_INTERNAL -//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading #include "llama.h" #include "unicode.h" @@ -154,10 +153,6 @@ static bool is_float_close(float a, float b, float abs_tol) { return std::fabs(b - a) <= abs_tol; } -#ifdef GGML_USE_CPU_HBM -#include -#endif - static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -1249,12 +1244,6 @@ struct llama_mlock { #endif }; -typedef void (*offload_func_t)(struct ggml_tensor * tensor); - -static void ggml_offload_nop(struct ggml_tensor * tensor) { - (void) tensor; -} - static 
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); @@ -1270,19 +1259,14 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ return std::string(result.data(), result.size()); } -static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { +static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - buft = ggml_backend_metal_buffer_type(); +#if defined(GGML_USE_CUBLAS) + // host buffers should only be used when data is expected to be copied to/from the GPU + if (host_buffer) { + buft = ggml_backend_cuda_host_buffer_type(); } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (n_gpu_layers > 0) { - buft = ggml_backend_cuda_buffer_type(0); - } -#elif defined(GGML_USE_CUBLAS) - buft = ggml_backend_cuda_host_buffer_type(); #elif defined(GGML_USE_CPU_HBM) buft = ggml_backend_cpu_hbm_buffer_type(); #endif @@ -1290,10 +1274,45 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { if (buft == nullptr) { buft = ggml_backend_cpu_buffer_type(); } - return buft; - GGML_UNUSED(n_gpu_layers); + GGML_UNUSED(host_buffer); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { + ggml_backend_buffer_type_t buft = nullptr; + +#ifdef GGML_USE_METAL + buft = ggml_backend_metal_buffer_type(); +#elif defined(GGML_USE_CUBLAS) + buft = ggml_backend_cuda_buffer_type(gpu); +#elif defined(GGML_USE_CLBLAST) + buft = ggml_backend_opencl_buffer_type(); +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_cpu(true); + } + return buft; + + GGML_UNUSED(gpu); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) { + ggml_backend_buffer_type_t buft = nullptr; + +#ifdef GGML_USE_CUBLAS + if (ggml_backend_cuda_get_device_count() > 1) { + buft = ggml_backend_cuda_split_buffer_type(tensor_split); + } +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_offload(fallback_gpu); + } + return buft; + + GGML_UNUSED(tensor_split); } // @@ -1504,24 +1523,24 @@ struct llama_kv_cache { std::vector k_l; // per layer std::vector v_l; - struct ggml_context * ctx = NULL; + std::vector ctxs; + std::vector bufs; - ggml_backend_buffer_t buf = NULL; + size_t total_size() const { + size_t size = 0; + for (ggml_backend_buffer_t buf : bufs) { + size += ggml_backend_buffer_get_size(buf); + } + return size; + } ~llama_kv_cache() { -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - for (size_t i = 0; i < k_l.size(); ++i) { - ggml_cuda_free_data(k_l[i]); - ggml_cuda_free_data(v_l[i]); - } - } -#endif - if (ctx) { + for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); } - - ggml_backend_buffer_free(buf); + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } } }; @@ -1598,16 +1617,32 @@ struct llama_model { std::vector layers; + llama_split_mode split_mode; + int main_gpu; int n_gpu_layers; // gguf metadata std::unordered_map gguf_kv; - // context - struct ggml_context * ctx = NULL; + // layer -> buffer type mapping + struct layer_buft { + layer_buft() : buft_matrix(nullptr), buft(nullptr) {} + layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), 
buft(matrix) {} + layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {} - // the model memory buffer - ggml_backend_buffer_t buf = NULL; + ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication + ggml_backend_buffer_type_t buft; // everything else + }; + + layer_buft buft_input; + layer_buft buft_output; + std::vector buft_layer; + + // contexts where the model tensors metadata is stored + std::vector ctxs; + + // the model memory buffers for the tensor data + std::vector bufs; // model memory mapped file std::unique_ptr mapping; @@ -1623,39 +1658,32 @@ struct llama_model { int64_t t_start_us = 0; ~llama_model() { -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cuda_free_data(tensors_by_name[i].second); - } - ggml_cuda_free_scratch(); - } -#endif - -#if defined(GGML_USE_CLBLAST) - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cl_free_data(tensors_by_name[i].second); - } -#endif - if (ctx) { + for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); } - - ggml_backend_buffer_free(buf); + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } } }; struct llama_context { llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} ~llama_context() { - ggml_allocr_free(alloc); - ggml_backend_buffer_free(buf_alloc); - ggml_backend_free(backend); + ggml_backend_sched_free(sched); + + for (ggml_backend_t backend : backends) { + ggml_backend_free(backend); + } } llama_cparams cparams; - ggml_backend_t backend = nullptr; + std::vector backends; +#ifdef GGML_USE_METAL + ggml_backend_t backend_metal = nullptr; +#endif + ggml_backend_t backend_cpu = nullptr; const llama_model & model; @@ -1689,8 +1717,9 @@ struct llama_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; - ggml_backend_buffer_t buf_alloc = NULL; - ggml_allocr * alloc = NULL; + ggml_backend_sched_t sched = nullptr; + // allocator for the input tensors + ggml_tallocr * alloc = nullptr; // TODO(jared): remove this #if defined(GGML_USE_KOMPUTE) @@ -1710,16 +1739,17 @@ struct llama_context { // static bool llama_kv_cache_init( - const struct llama_hparams & hparams, struct llama_kv_cache & cache, + const llama_model & model, ggml_type ktype, ggml_type vtype, uint32_t n_ctx, - int n_gpu_layers, bool offload) { + const struct llama_hparams & hparams = model.hparams; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_layer = hparams.n_layer; + const int64_t n_layer = hparams.n_layer; cache.has_shift = false; @@ -1730,62 +1760,65 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - struct ggml_init_params params; - params.mem_size = 2u*n_layer*ggml_tensor_overhead(); - params.mem_buffer = NULL; - params.no_alloc = true; +#ifdef GGML_USE_CLBLAST + offload = false; +#endif - cache.ctx = ggml_init(params); + // count used buffer types + std::map buft_layer_count; + if (offload) { + for (int64_t i = 0; i < n_layer; ++i) { + buft_layer_count[model.buft_layer[i].buft]++; + } + } else { + buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer; + } - size_t vram_kv_cache = 0; - - if (!cache.ctx) { - LLAMA_LOG_ERROR("%s: failed to allocate memory for kv 
cache\n", __func__); - return false; + // create a context for each buffer type + std::map ctx_map; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + struct ggml_init_params params = { + /*.mem_size =*/ 2u*n_layers*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__); + return false; + } + ctx_map[it.first] = ctx; + cache.ctxs.push_back(ctx); } cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); - const int i_gpu_start = (int) n_layer - n_gpu_layers; - for (int i = 0; i < (int) n_layer; i++) { - ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx); + struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); + ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (i >= i_gpu_start) { - if (offload) { - ggml_cuda_assign_buffers_no_scratch(k); - ggml_cuda_assign_buffers_no_scratch(v); - vram_kv_cache += ggml_nbytes(k); - vram_kv_cache += ggml_nbytes(v); - // HACK: mark tensor as allocated - k->data = v->data = (void *)(uintptr_t)1; - } + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); + return false; } -#endif // GGML_USE_CUBLAS + ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + cache.bufs.push_back(buf); } - // allocate tensors - cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers)); - - // buf may be NULL with full offload - if (cache.buf) { - // initialize the buffer to avoid NaNs in the padding - ggml_backend_buffer_clear(cache.buf, 0); - } - - if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); - } - - GGML_UNUSED(i_gpu_start); - GGML_UNUSED(offload); - return true; } @@ -2420,9 +2453,8 @@ struct llama_model_loader { return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) { struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - tensor->backend = backend; // TODO: ggml_set_backend ggml_set_name(tensor, ggml_get_name(meta)); n_created++; @@ -2430,7 +2462,7 @@ struct llama_model_loader { return tensor; } - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend, bool required = true) { + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { struct ggml_tensor * 
cur = ggml_get_tensor(ctx_meta, name.c_str()); if (cur == NULL) { @@ -2440,12 +2472,6 @@ struct llama_model_loader { throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } - if (backend == GGML_BACKEND_GPU_SPLIT) { - if (ne.size() == 1) { - throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); - } - } - { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { @@ -2463,7 +2489,7 @@ struct llama_model_loader { } } - return create_tensor_for(ctx, cur, backend); + return create_tensor_for(ctx, cur); } void done_getting_tensors() const { @@ -2482,26 +2508,36 @@ struct llama_model_loader { return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); } - void init_mapping(bool prefetch = true) { - /* - // prefetch only CPU tensors - if (use_mmap) { - size_t size_pref = 0; // prefetch - - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - if (cur->backend == GGML_BACKEND_CPU) { - size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur); - size_pref = std::max(size_pref, tensor_end); - } - } - mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa())); - } - */ + void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { // prefetch the whole file - all the data is needed anyway if (use_mmap) { mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa())); } + + // compute the total size of all tensors for progress reporting + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + size_data += ggml_nbytes(cur); + } + + if (use_mmap && mapping) { + if (lmlock) { + lmlock->init(mapping->addr); + } + mmap_used_first = mapping->size; + } + } + + void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const { + GGML_ASSERT(mapping); + + *first = mapping->size; + *last = 0; + for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { + const size_t offs = file_offset(ggml_get_name(tensor)); + *first = std::min(*first, offs); + *last = std::max(*last, offs + ggml_nbytes(tensor)); + } } // for backwards compatibility, does not support ggml-backend @@ -2509,8 +2545,11 @@ struct llama_model_loader { const size_t offs = file_offset(ggml_get_name(cur)); if (use_mmap && mapping) { - GGML_ASSERT(cur->data == nullptr); - cur->data = (uint8_t *)mapping->addr + offs; + if (cur->data == nullptr) { + cur->data = (uint8_t *)mapping->addr + offs; + } else { + memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur)); + } } else { GGML_ASSERT(cur->data != nullptr); file.seek(offs, SEEK_SET); @@ -2518,37 +2557,23 @@ struct llama_model_loader { } } + size_t size_done = 0; + size_t size_data = 0; + size_t mmap_used_first = -1; + size_t mmap_used_last = 0; + // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { - size_t size_data = 0; - - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); - } - - if (use_mmap && buf_mmap) { - if (lmlock) { - 
lmlock->init(mapping->addr); - } - } - -#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) - const bool legacy_offload = true; -#else - const bool legacy_offload = false; -#endif + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { + GGML_ASSERT(size_data != 0 && "call init_mapping() first"); std::vector> read_buf; - size_t size_done = 0; - - size_t mmap_first = -1; - size_t mmap_last = 0; - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - GGML_ASSERT(cur); // unused tensors should have been caught by load_data already + if (!cur) { + // some tensors may be allocated in a different context + continue; + } if (progress_callback) { if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { @@ -2558,67 +2583,48 @@ struct llama_model_loader { const size_t offs = file_offset(ggml_get_name(cur)); - if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) { - if (use_mmap && mapping) { - if (buf_mmap) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); - if (lmlock) { - lmlock->grow_to(offs + ggml_nbytes(cur)); - } - mmap_first = std::min(mmap_first, offs); - mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur)); - } else { - ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); + if (use_mmap && mapping) { + if (buf_mmap && cur->data == nullptr) { + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); + if (lmlock) { + lmlock->grow_to(offs + ggml_nbytes(cur)); } + mmap_used_first = std::min(mmap_used_first, offs); + mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur)); } else { - if (ggml_backend_buffer_is_host(cur->buffer)) { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); - } else { - read_buf.resize(ggml_nbytes(cur)); - file.seek(offs, SEEK_SET); - file.read_raw(read_buf.data(), ggml_nbytes(cur)); - ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); - } + ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); } } else { - // HACK: mark tensor as allocated - cur->data = (void *)(uintptr_t)1; - void * data; - if (use_mmap && mapping) { - data = (uint8_t *) mapping->addr + offs; + if (ggml_backend_buffer_is_host(cur->buffer)) { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); } else { read_buf.resize(ggml_nbytes(cur)); file.seek(offs, SEEK_SET); file.read_raw(read_buf.data(), ggml_nbytes(cur)); - data = read_buf.data(); + ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); } - -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - ggml_cuda_transform_tensor(data, cur); -#elif defined(GGML_USE_CLBLAST) - GGML_ASSERT(cur->backend == GGML_BACKEND_GPU); - ggml_cl_transform_tensor(data, cur); -#else - GGML_ASSERT(!"GPU tensor without a GPU backend"); - GGML_UNUSED(data); -#endif } size_done += ggml_nbytes(cur); } - // unmap offloaded tensors and metadata - if (use_mmap && mapping) { - mapping->unmap_fragment(0, mmap_first); - mapping->unmap_fragment(mmap_last, mapping->size); + // check if this is the last call and do final cleanup + if (size_done >= size_data) { + // unmap offloaded tensors and metadata + if (use_mmap && mapping) { + mapping->unmap_fragment(0, 
mmap_used_first); + if (mmap_used_last != 0) { + mapping->unmap_fragment(mmap_used_last, mapping->size); + } + } + if (progress_callback) { + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. + return progress_callback(1.0f, progress_callback_user_data); + } } - if (progress_callback) { - // Even though the model is done loading, we still honor - // cancellation since we need to free allocations. - return progress_callback(1.0f, progress_callback_user_data); - } return true; } }; @@ -3245,6 +3251,7 @@ static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, + enum llama_split_mode split_mode, int main_gpu, const float * tensor_split, bool use_mlock, @@ -3252,702 +3259,563 @@ static bool llm_load_tensors( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); - auto & ctx = model.ctx; auto & hparams = model.hparams; + model.split_mode = split_mode; + model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; - size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors; + const int64_t n_layer = hparams.n_layer; + const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0); - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); + // there is very little benefit to offloading the input layer, so always keep it on the CPU + model.buft_input = llama_default_buffer_type_cpu(true); - // create the ggml context + model.buft_layer.resize(n_layer); + + // assign cpu layers + for (int64_t i = 0; i < i_gpu_start; ++i) { + model.buft_layer[i] = llama_default_buffer_type_cpu(true); + } + +#ifdef GGML_USE_CUBLAS + if (split_mode == LLAMA_SPLIT_LAYER) { + // calculate the split points + int device_count = ggml_backend_cuda_get_device_count(); + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); + float splits[GGML_CUDA_MAX_DEVICES]; + if (all_zero) { + // default split, by free memory + for (int i = 0; i < device_count; ++i) { + size_t total; + size_t free; + ggml_backend_cuda_get_device_memory(i, &total, &free); + splits[i] = free; + } + } else { + std::copy(tensor_split, tensor_split + device_count, splits); + } + + // sum and normalize the splits to get the split points + float split_sum = 0.0f; + for (int i = 0; i < device_count; ++i) { + split_sum += splits[i]; + splits[i] = split_sum; + } + for (int i = 0; i < device_count; ++i) { + splits[i] /= split_sum; + } + + // assign the repeating layers to the devices according to the splits + int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); + for (int64_t i = i_gpu_start; i < n_layer; ++i) { + int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits; + model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); + } + // assign the output layer + if (n_gpu_layers > n_layer) { + int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits; + model.buft_output = llama_default_buffer_type_offload(layer_gpu); + } else { + model.buft_output = llama_default_buffer_type_cpu(true); + } + } else +#endif { + ggml_backend_buffer_type_t split_buft; + if (split_mode == LLAMA_SPLIT_ROW) { + split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); + } else { + // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + split_buft = 
llama_default_buffer_type_offload(main_gpu); + } + // assign the repeating layers + for (int64_t i = i_gpu_start; i < n_layer; ++i) { + model.buft_layer[i] = { + split_buft, + llama_default_buffer_type_offload(main_gpu) + }; + } + // assign the output layer + if (n_gpu_layers > n_layer) { + model.buft_output = { + split_buft, + llama_default_buffer_type_offload(main_gpu) + }; + } else { + model.buft_output = llama_default_buffer_type_cpu(true); + } + } + + // count used buffer types + std::map buft_layer_count; + buft_layer_count[model.buft_input.buft]++; + buft_layer_count[model.buft_input.buft_matrix]++; + buft_layer_count[model.buft_output.buft]++; + buft_layer_count[model.buft_output.buft_matrix]++; + for (int64_t i = 0; i < n_layer; ++i) { + buft_layer_count[model.buft_layer[i].buft]++; + buft_layer_count[model.buft_layer[i].buft_matrix]++; + } + + // create one context per buffer type + size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors; + std::map ctx_map; + for (auto & it : buft_layer_count) { struct ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - throw std::runtime_error(format("ggml_init() failed")); + ggml_context * ctx = ggml_init(params); + if (!ctx) { + throw std::runtime_error(format("failed to create context")); } + ctx_map[it.first] = ctx; + model.ctxs.push_back(ctx); } - (void) main_gpu; - - enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; - enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; - -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); - - llama_backend_offload = GGML_BACKEND_GPU; - llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT; - } -#elif defined(GGML_USE_CLBLAST) - LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); - llama_backend_offload = GGML_BACKEND_GPU; - llama_backend_offload_split = GGML_BACKEND_GPU; -#endif + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0); // create tensors for the weights { const int64_t n_embd = hparams.n_embd; const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const int64_t n_layer = hparams.n_layer; + const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_vocab = hparams.n_vocab; + const int64_t n_ff = hparams.n_ff; + + GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); + + ggml_context * ctx_input = ctx_map.at(model.buft_input.buft); + ggml_context * ctx_output = ctx_map.at(model.buft_output.buft); + ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix); + auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); }; + auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); }; + + model.layers.resize(n_layer); const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = 
llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // optional bias tensors - layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false); - layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false); - layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false); + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false); if (layer.ffn_gate_inp == nullptr) { 
GGML_ASSERT(hparams.n_expert == 0); GGML_ASSERT(hparams.n_expert_used == 0); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } else { GGML_ASSERT(hparams.n_expert > 0); GGML_ASSERT(hparams.n_expert_used > 0); // MoE branch for (uint32_t x = 0; x < hparams.n_expert; ++x) { - layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); - layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split); - layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); + layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}); + layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}); + layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}); } } } } break; case LLM_ARCH_BAICHUAN: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_FALCON: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < 
n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { - layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); - layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); } - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_STARCODER: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = 
ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_PERSIMMON: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; 
- - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); - const int i_gpu_start = n_layer - n_gpu_layers; - model.layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); - layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, 
tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}); + layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}); + + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}); + layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}); } } break; case LLM_ARCH_BLOOM: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_MPT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = 
ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); // AWQ ScaleActivation layer - layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false); + layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); } } break; case LLM_ARCH_STABLELM: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = 
n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - /* - llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ] - */ - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_QWEN: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } - if (n_gpu_layers > 
int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - } - - const uint32_t n_ff = hparams.n_ff / 2; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}); } } break; case LLM_ARCH_PHI2: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - model.output_b = ml.create_tensor(ctx, 
tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_PLAMO: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if 
(n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_GPT2: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); // output { - ggml_backend_type backend_norm; - 
ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = 
ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; default: @@ -3957,78 +3825,51 @@ static bool llm_load_tensors( ml.done_getting_tensors(); - ml.init_mapping(); + ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr); - // allocate tensors - size_t vram_weights = 0; - size_t buf_size = 0; + // create the backend buffers + std::vector> ctx_bufs; - ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers); + for (auto & it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = nullptr; - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend - if (t->backend == GGML_BACKEND_CPU) { - buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft)); - } else { - vram_weights += ggml_nbytes(t); + // only the mmap region containing the tensors in the model is mapped to the backend buffer + // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size + if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { + size_t first, last; + ml.get_mapping_range(&first, &last, ctx); + buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); } - } - - // create backend buffer - ggml_backend_buffer_t buf_mmap = nullptr; - #ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - if (ml.use_mmap) { + else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { const size_t max_size = ggml_get_max_tensor_size(ctx); - model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size); - buf_mmap = model.buf; - } else { - model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + size_t first, last; + ml.get_mapping_range(&first, &last, ctx); + buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size); } - } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - // for testing only - if (n_gpu_layers > 0) { - model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0)); - } #endif - - if (model.buf == nullptr) { - // CPU backend, and indirectly CUDA and OpenCL - if (ml.use_mmap) { - model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); - buf_mmap = model.buf; - } else { - // allocate only CPU tensors - model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size); - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf); - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - if (t->backend == GGML_BACKEND_CPU) { - ggml_tallocr_alloc(alloc, t); - } + else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf != 
nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) { + model.mlock_buf.init (ggml_backend_buffer_get_base(buf)); + model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf)); } - ggml_tallocr_free(alloc); } - } - - if (use_mlock && ggml_backend_buffer_is_host(model.buf)) { - model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf)); - model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf)); + if (buf == nullptr) { + throw std::runtime_error("failed to allocate buffer"); + } + // indicate that this buffer contains weights + // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.bufs.push_back(buf); + ctx_bufs.emplace_back(ctx, buf); } // print memory requirements { - size_t sys_mem_required = ctx_size + buf_size; - - if (sys_mem_required > 0) { - LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0); - } - if (vram_weights > 0) { - LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); - } - -#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); @@ -4040,23 +3881,26 @@ static bool llm_load_tensors( const int max_offloadable_layers = hparams.n_layer + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); -#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - } -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - ggml_cuda_set_tensor_split(tensor_split); -#else - GGML_UNUSED(tensor_split); -#endif // GGML_USE_CUBLAS + for (ggml_backend_buffer_t buf : model.bufs) { + LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + } + } // populate tensors_by_name - for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i)); - model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); + for (ggml_context * ctx : model.ctxs) { + for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); + } } - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) { - return false; + // load tensor data + for (auto & it : ctx_bufs) { + ggml_context * ctx = it.first; + ggml_backend_buffer_t buf = it.second; + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? 
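The loading path introduced above gathers the created tensors into one ggml_context per backend buffer type (the ctx_map), allocates a single backend buffer per context (from the mmap range, a Metal mapping, or a fresh allocation), tags it as holding weights, and then loads the data per (context, buffer) pair. A rough standalone sketch of that grouping-then-allocating shape, with placeholder types standing in for the ggml_backend objects:

    #include <cstdio>
    #include <map>
    #include <string>
    #include <utility>
    #include <vector>

    // Stand-ins for ggml_backend_buffer_type_t / ggml_context / ggml_backend_buffer_t;
    // only the shape of the loop is meant to match the patch.
    struct FakeTensor  { std::string name; size_t nbytes; };
    struct FakeContext { std::vector<FakeTensor> tensors; };
    struct FakeBuffer  { std::string buft; size_t size; bool weights; };

    int main() {
        // one "context" per buffer type, as built by ctx_map above
        std::map<std::string, FakeContext> ctx_map;
        ctx_map["CPU" ].tensors = { {"token_embd.weight", 512}, {"output_norm.weight", 64} };
        ctx_map["GPU0"].tensors = { {"blk.0.attn_q.weight", 4096}, {"blk.0.ffn_up.weight", 8192} };

        // one buffer per (buffer type, context) pair, remembered for the load step,
        // mirroring the ctx_bufs vector used by the tensor-data loading loop
        std::vector<std::pair<const FakeContext *, FakeBuffer>> ctx_bufs;
        for (const auto & it : ctx_map) {
            size_t total = 0;
            for (const auto & t : it.second.tensors) {
                total += t.nbytes;
            }
            ctx_bufs.emplace_back(&it.second, FakeBuffer{ it.first, total, /*weights=*/true });
            std::printf("%-4s buffer size = %zu bytes\n", it.first.c_str(), total);
        }
        return 0;
    }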
&model.mlock_mmap : NULL)) { + return false; + } } model.mapping = std::move(ml.mapping); @@ -4105,13 +3949,13 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam #endif if (!llm_load_tensors( - ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data )) { return -2; } } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); return -1; } @@ -4566,8 +4410,6 @@ struct llm_build_context { , ctx_kompute (lctx.ctx_kompute) #endif { - GGML_ASSERT(!!kv_self.ctx); - // all initializations should be done in init() } @@ -4647,6 +4489,12 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, Qcur); + ggml_build_forward_expand(gf, Kcur); + ggml_build_forward_expand(gf, Vcur); + Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, @@ -6165,199 +6013,13 @@ struct llm_build_context { } }; -// -// tensor offloading helpers -// -// TODO: will be removed with backend v2 - -enum llm_offload_func_e { - OFFLOAD_FUNC_NOP, - OFFLOAD_FUNC, - OFFLOAD_FUNC_FRC, // force offload - OFFLOAD_FUNC_KQV, - OFFLOAD_FUNC_NR, - OFFLOAD_FUNC_EMB, // embeddings - OFFLOAD_FUNC_OUT, -}; - -// TODO: will be removed with backend v2 -struct llm_offload_trie { - struct node { - ~node() { - for (int i = 0; i < 256; ++i) { - if (children[i]) { - delete children[i]; - } - } - } - - node * children[256] = { nullptr }; - llm_offload_func_e func = OFFLOAD_FUNC_NOP; - }; - - llm_offload_trie() { - root = new node; - } - - llm_offload_trie(const std::unordered_map & map) { - root = new node; - - for (const auto & kv : map) { - add(kv.first, kv.second); - } - } - - ~llm_offload_trie() { - delete root; - } - - void add(const char * name, llm_offload_func_e func) { - node * cur = root; - - for (int i = 0; ; ++i) { - const uint8_t c = name[i]; - - if (!c) { - break; - } - - if (!cur->children[c]) { - cur->children[c] = new node; - } - - cur = cur->children[c]; - } - - cur->func = func; - } - - llm_offload_func_e find(const char * name) const { - const node * cur = root; - - for (int i = 0; ; ++i) { - const uint8_t c = name[i]; - - if (!c) { - break; - } - - if (!cur->children[c]) { - return OFFLOAD_FUNC_NOP; - } - - cur = cur->children[c]; - } - - return cur->func; - } - - node * root = nullptr; -}; - -// TODO: will be removed with backend v2 -static const std::unordered_map k_offload_map = { - //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - { "pos_embd", OFFLOAD_FUNC_NR }, - - { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. 
rope) - { "KQ_mask", OFFLOAD_FUNC_FRC }, - { "K_shift", OFFLOAD_FUNC_FRC }, - - { "K_shifted", OFFLOAD_FUNC }, - - { "inp_norm", OFFLOAD_FUNC_NR }, - { "inp_norm_w", OFFLOAD_FUNC_NR }, - { "inp_norm_wb", OFFLOAD_FUNC_NR }, - - { "norm", OFFLOAD_FUNC }, - { "norm_w", OFFLOAD_FUNC }, - { "norm_wb", OFFLOAD_FUNC }, - - { "attn_norm", OFFLOAD_FUNC }, - { "attn_norm_2", OFFLOAD_FUNC }, - - { "wqkv", OFFLOAD_FUNC_KQV }, - { "bqkv", OFFLOAD_FUNC_KQV }, - { "wqkv_clamped", OFFLOAD_FUNC_KQV }, - - { "tmpk", OFFLOAD_FUNC_KQV }, - { "tmpq", OFFLOAD_FUNC_KQV }, - { "tmpv", OFFLOAD_FUNC_KQV }, - { "Kcur", OFFLOAD_FUNC_KQV }, - { "Qcur", OFFLOAD_FUNC_KQV }, - { "Vcur", OFFLOAD_FUNC_KQV }, - - { "krot", OFFLOAD_FUNC_KQV }, - { "qrot", OFFLOAD_FUNC_KQV }, - { "kpass", OFFLOAD_FUNC_KQV }, - { "qpass", OFFLOAD_FUNC_KQV }, - { "krotated", OFFLOAD_FUNC_KQV }, - { "qrotated", OFFLOAD_FUNC_KQV }, - - { "q", OFFLOAD_FUNC_KQV }, - { "k", OFFLOAD_FUNC_KQV }, - { "kq", OFFLOAD_FUNC_KQV }, - { "kq_scaled", OFFLOAD_FUNC_KQV }, - { "kq_scaled_alibi", OFFLOAD_FUNC_KQV }, - { "kq_masked", OFFLOAD_FUNC_KQV }, - { "kq_soft_max", OFFLOAD_FUNC_KQV }, - { "kq_soft_max_ext", OFFLOAD_FUNC_KQV }, - { "v", OFFLOAD_FUNC_KQV }, - { "kqv", OFFLOAD_FUNC_KQV }, - { "kqv_merged", OFFLOAD_FUNC_KQV }, - { "kqv_merged_cont", OFFLOAD_FUNC_KQV }, - { "kqv_wo", OFFLOAD_FUNC_KQV }, - { "kqv_out", OFFLOAD_FUNC_KQV }, - - { "ffn_inp", OFFLOAD_FUNC }, - { "ffn_norm", OFFLOAD_FUNC }, - - { "ffn_up", OFFLOAD_FUNC }, - { "ffn_up_b", OFFLOAD_FUNC }, - { "ffn_gate", OFFLOAD_FUNC }, - { "ffn_gate_b", OFFLOAD_FUNC }, - { "ffn_gate_par", OFFLOAD_FUNC }, - { "ffn_act", OFFLOAD_FUNC }, - { "ffn_down", OFFLOAD_FUNC }, - { "ffn_down_b", OFFLOAD_FUNC }, - { "ffn_out", OFFLOAD_FUNC }, - - { "ffn_silu", OFFLOAD_FUNC }, - { "ffn_gelu", OFFLOAD_FUNC }, - { "ffn_relu", OFFLOAD_FUNC }, - { "ffn_sqr(relu)", OFFLOAD_FUNC }, - - { "ffn_moe_logits", OFFLOAD_FUNC }, - { "ffn_moe_probs", OFFLOAD_FUNC }, - { "ffn_moe_argsort", OFFLOAD_FUNC }, - { "ffn_moe_weights", OFFLOAD_FUNC }, - { "ffn_moe_weights_sum", OFFLOAD_FUNC }, - { "ffn_moe_weights_norm", OFFLOAD_FUNC }, - { "ffn_moe_weighted", OFFLOAD_FUNC }, - { "ffn_moe_up", OFFLOAD_FUNC }, - { "ffn_moe_gate", OFFLOAD_FUNC }, - { "ffn_moe_silu", OFFLOAD_FUNC }, - { "ffn_moe_gate_par", OFFLOAD_FUNC }, - { "ffn_moe_down", OFFLOAD_FUNC }, - { "ffn_moe_out", OFFLOAD_FUNC }, - - { "l_out", OFFLOAD_FUNC }, - - { "result_norm", OFFLOAD_FUNC_EMB }, - { "result_output_no_bias", OFFLOAD_FUNC_EMB }, - { "result_output", OFFLOAD_FUNC_OUT }, -}; - -static llm_offload_trie k_offload_func_trie(k_offload_map); - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch) { const auto & model = lctx.model; // check if we should build the worst-case graph (for memory measurement) - const bool worst_case = ggml_allocr_is_measure(lctx.alloc); + const bool worst_case = ggml_tallocr_is_measure(lctx.alloc); // keep track of the input that has already been allocated bool alloc_inp_tokens = false; @@ -6366,20 +6028,13 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - const bool do_offload = true; -#else - const bool do_offload = true; // TODO: set to false after finishing refactoring -#endif - + // TODO(jared): do we still need this? 
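With the name-based offload trie and k_offload_map deleted, per-tensor placement is no longer looked up from the tensor name. Instead, the buffers created during loading are tagged with GGML_BACKEND_BUFFER_USAGE_WEIGHTS, and the comment above notes that ggml_backend_sched prefers to schedule an op onto the backend that already contains its weight. A toy sketch of that preference rule, with made-up structs rather than the actual scheduler:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Toy stand-ins: each op may reference one "weight" operand that lives in some
    // backend's buffer; ops follow their weight, everything else stays on a default backend.
    struct Weight { std::string name; int backend_id; };
    struct Op     { std::string name; const Weight * weight; };

    static int pick_backend(const Op & op, int default_backend) {
        if (op.weight != nullptr) {
            return op.weight->backend_id; // schedule where the weight already lives
        }
        return default_backend;
    }

    int main() {
        const Weight wq  = { "blk.0.attn_q.weight", /*backend_id=*/1 }; // e.g. a GPU buffer
        const Weight out = { "output.weight",       /*backend_id=*/0 }; // e.g. the CPU buffer

        const std::vector<Op> ops = {
            { "Qcur",          &wq     },
            { "result_output", &out    },
            { "inp_pos",       nullptr },
        };

        for (const auto & op : ops) {
            std::printf("%-13s -> backend %d\n", op.name.c_str(), pick_backend(op, 0));
        }
        return 0;
    }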
#ifdef GGML_USE_KOMPUTE const bool needs_h2d_all = lctx.ctx_kompute && !ggml_vk_has_h2d_all(lctx.ctx_kompute); #endif - int n_non_view = 0; // number of non-view tensors that have been processed by the callback - // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) - // TODO: will be removed with backend v2 + // TODO: improve handling of input and output tensors, then replace this with ggml_set_name llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -6390,12 +6045,11 @@ static struct ggml_cgraph * llama_build_graph( // // allocate input tensors and set input data // - // TODO: will be removed with backend v2 if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) { const int64_t n_tokens = cur->ne[0]; ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur)); @@ -6404,10 +6058,10 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_tokens = true; } - if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) { + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -6418,9 +6072,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) { const int64_t n_tokens = cur->ne[0]; static_assert(std::is_same::value, "llama_pos must be int32_t"); @@ -6431,9 +6085,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_tallocr_is_measure(lctx.alloc)) { const int64_t n_kv = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -6471,9 +6125,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_tallocr_is_measure(lctx.alloc)) { const int64_t n_ctx = cur->ne[0]; int32_t * data; @@ -6496,15 +6150,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_K_shift = true; } - // view tensors are not processed further - if (cur->view_src != nullptr) { - return; - } - - if (cur->op != GGML_OP_NONE) { - n_non_view++; - } - + // TODO(jared): this shouldn't be needed anymore #ifdef GGML_USE_KOMPUTE if (lctx.ctx_kompute && !needs_h2d_all) { const char * offload_tensors[] = {"inp_tokens", "inp_pos", "KQ_mask", "K_shift"}; @@ -6519,127 +6165,6 @@ static struct ggml_cgraph * llama_build_graph( } } #endif - - // - // offload layers - // - // TODO: will be removed with backend v2 - -//#define LLAMA_OFFLOAD_DEBUG - - if (!do_offload) { - return; - } - - const int n_layer = model.hparams.n_layer; - - const int n_gpu_layers = model.n_gpu_layers; - const int i_gpu_start = 
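In the callback above, graph inputs such as inp_tokens and inp_pos are now placed with ggml_tallocr_alloc (previously ggml_allocr_alloc) and filled through ggml_backend_tensor_set. A compact sketch of that allocate-then-set flow, reusing the same calls that appear in this patch; it assumes the ggml.h / ggml-alloc.h / ggml-backend.h headers from roughly this revision and uses a plain CPU buffer for simplicity:

    #include <cstdint>
    #include <vector>

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main() {
        // metadata-only context: tensor shapes live here, the data lives in a backend buffer
        struct ggml_init_params params = { ggml_tensor_overhead()*8, nullptr, /*no_alloc=*/true };
        struct ggml_context * ctx = ggml_init(params);

        const int64_t n_tokens = 4;
        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);

        // allocate a backend buffer and place the tensor in it with a tallocr,
        // mirroring the ggml_tallocr_alloc calls in the graph-build callback
        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), 1024*1024);
        ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buf);
        ggml_tallocr_alloc(alloc, inp_tokens);

        // fill it through the backend interface, as the callback does with batch.token
        std::vector<int32_t> tokens = { 1, 2, 3, 4 };
        ggml_backend_tensor_set(inp_tokens, tokens.data(), 0, n_tokens*ggml_element_size(inp_tokens));

        ggml_tallocr_free(alloc);
        ggml_backend_buffer_free(buf);
        ggml_free(ctx);
        return 0;
    }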
n_layer - n_gpu_layers; - - // should we offload the final norm? yes if we are not computing embeddings - const bool offload_emb = lctx.embedding.empty(); - - static const std::unordered_map> k_offload_func_name = { - { OFFLOAD_FUNC_NOP, "CPU" }, - { OFFLOAD_FUNC_OUT, "CPU" }, -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - { OFFLOAD_FUNC, "GPU (CUDA)" }, - { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, - { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" }, - { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, - { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, -#else - { OFFLOAD_FUNC, "CPU" }, - { OFFLOAD_FUNC_FRC, "CPU" }, - { OFFLOAD_FUNC_KQV, "CPU" }, - { OFFLOAD_FUNC_NR, "CPU" }, - { OFFLOAD_FUNC_EMB, "CPU" }, -#endif // GGML_USE_CUBLAS - }; - - // check the global map for what offload function to use for this tensor - llm_offload_func_e func_e = k_offload_func_trie.find(name); - - if (func_e == OFFLOAD_FUNC_NOP) { -#ifdef LLAMA_OFFLOAD_DEBUG - // if a tensor hasn't been offloaded, we warn the user - if (worst_case) { - LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, - cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); - } -#endif - - return; - } - - // count the number of layers and respect the provided n_gpu_layers - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: - break; - case OFFLOAD_FUNC: - if (n_gpu_layers < n_layer) { - if (il < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } - } - break; - case OFFLOAD_FUNC_FRC: - if (!lctx.cparams.offload_kqv) { - func_e = OFFLOAD_FUNC_NOP; - } break; - case OFFLOAD_FUNC_KQV: - if (!lctx.cparams.offload_kqv) { - func_e = OFFLOAD_FUNC_NOP; - } else { - if (n_gpu_layers < n_layer) { - if (il < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } - } - } - break; - case OFFLOAD_FUNC_NR: - if (n_gpu_layers <= n_layer + 0) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_EMB: - if (!offload_emb || n_gpu_layers < n_layer) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - default: GGML_ASSERT(false); - } - - offload_func_t func = ggml_offload_nop; - - // this is needed for compatibility with Metal for example -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; -#else - static offload_func_t ggml_offload_gpu = ggml_offload_nop; -#endif - - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; - case OFFLOAD_FUNC: - case OFFLOAD_FUNC_KQV: - case OFFLOAD_FUNC_FRC: - case OFFLOAD_FUNC_NR: - case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; - default: GGML_ASSERT(false); - } - - // apply offload function to the tensor - func(cur); - -#ifdef LLAMA_OFFLOAD_DEBUG - if (worst_case) { - LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); - } -#endif }; struct ggml_cgraph * result = NULL; @@ -6713,27 +6238,6 @@ static struct ggml_cgraph * llama_build_graph( llm.free(); - if (worst_case) { - int n_non_view_total = 0; - - for (int i = 0; i < result->n_nodes; ++i) { - if (result->nodes[i]->view_src == nullptr) { - n_non_view_total++; - } - } - - LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); - - if (n_non_view != n_non_view_total) { - LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); - LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); - LLAMA_LOG_WARN("%s: this can indicate an inefficiency 
in the graph implementation\n", __func__); - LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); - LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); - LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); - } - } - return result; } @@ -6779,8 +6283,6 @@ static int llama_decode_internal( auto & kv_self = lctx.kv_self; - GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; @@ -6834,12 +6336,8 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_allocr_reset(lctx.alloc); - ggml_cgraph * gf = llama_build_graph(lctx, batch); - ggml_allocr_alloc_graph(lctx.alloc, gf); - // the output is always the last tensor in the graph struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; GGML_ASSERT(strcmp(res->name, "result_output") == 0); @@ -6851,30 +6349,6 @@ static int llama_decode_internal( GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); } -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc); - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); - ggml_cuda_copy_to_device(node); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); - } - } - - // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed - if (!lctx.embedding.empty()) { - embeddings->backend = GGML_BACKEND_CPU; - } - res->backend = GGML_BACKEND_CPU; -#endif - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // for big prompts, if BLAS is enabled, it is better to use only one thread @@ -6897,8 +6371,8 @@ static int llama_decode_internal( #endif #ifdef GGML_USE_METAL - if (ggml_backend_is_metal(lctx.backend)) { - ggml_backend_metal_set_n_cb(lctx.backend, n_threads); + if (ggml_backend_is_metal(lctx.backend_metal)) { + ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); } #elif defined(GGML_USE_KOMPUTE) if (lctx.ctx_kompute && n_tokens == 1) { @@ -6921,10 +6395,12 @@ static int llama_decode_internal( } #endif - if (ggml_backend_is_cpu(lctx.backend)) { - ggml_backend_cpu_set_n_threads(lctx.backend, n_threads); + if (lctx.backend_cpu != nullptr) { + ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); } - ggml_backend_graph_compute(lctx.backend, gf); + ggml_backend_sched_graph_compute(lctx.sched, gf); + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); #ifdef GGML_USE_MPI ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); @@ -9473,48 +8949,23 @@ static int llama_apply_lora_from_file_internal( LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - // create a name -> tensor map of the model to accelerate lookups - // find the max tensor size to estimate the required temporary buffer size - size_t max_tensor_size = 0; - std::unordered_map model_tensors; - for (const auto & kv : 
model.tensors_by_name) { - model_tensors.insert(kv); - size_t f32_size = ggml_nelements(kv.second) * sizeof(float); - max_tensor_size = std::max(max_tensor_size, f32_size); - } - - // create a temporary ggml context to store the lora tensors - // TODO: use ggml-alloc - size_t lora_ctx_size = max_tensor_size * 3; - LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0); - std::vector lora_buf(lora_ctx_size); - - struct ggml_init_params params; - params.mem_size = lora_buf.size(); - params.mem_buffer = lora_buf.data(); - params.no_alloc = false; - - using unique_context = std::unique_ptr; - - unique_context lora_ctx(nullptr, ggml_free); - lora_ctx.reset(ggml_init(params)); - std::unordered_map lora_tensors; - // load base model std::unique_ptr ml; - - if (path_base_model) { + if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); - ml->init_mapping(false); // no prefetching + ml->init_mapping(/*prefetch*/ false); // no prefetching } - // read tensors and apply - bool warned = false; - int n_tensors = 0; - - std::vector work_buffer; + struct tensor_meta { + std::string name; + ggml_type type; + int32_t ne[2]; + size_t offset; + }; + std::map tensor_meta_map; + // load all tensor meta while (true) { if (fin.tell() == fin.size) { // eof @@ -9527,7 +8978,7 @@ static int llama_apply_lora_from_file_internal( fin.read_raw(&n_dims, sizeof(n_dims)); fin.read_raw(&name_len, sizeof(name_len)); - fin.read_raw(&ftype, sizeof(ftype)); + fin.read_raw(&ftype, sizeof(ftype)); if (n_dims != 1 && n_dims != 2) { LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); @@ -9541,31 +8992,23 @@ static int llama_apply_lora_from_file_internal( std::string name; { - GGML_ASSERT(name_len <= 1024); - char buf[1024]; + GGML_ASSERT(name_len < GGML_MAX_NAME); + char buf[GGML_MAX_NAME]; fin.read_raw(buf, name_len); name = std::string(buf, name_len); } - // check for lora suffix and get the type of tensor - const std::string lora_suffix = ".lora"; - size_t pos = name.rfind(lora_suffix); - if (pos == std::string::npos) { + // check for lora suffix + std::string lora_suffix; + if (name.length() > 6) { + lora_suffix = name.substr(name.length() - 6); + } + if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); return 1; } - std::string lora_type = name.substr(pos + lora_suffix.length()); - std::string base_name = name; - base_name.erase(pos); - // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str()); - - if (model_tensors.find(base_name) == model_tensors.end()) { - LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); - return 1; - } - - // create ggml tensor + // tensor type ggml_type wtype; switch (ftype) { case 0: wtype = GGML_TYPE_F32; break; @@ -9577,122 +9020,177 @@ static int llama_apply_lora_from_file_internal( return false; } } - ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]); - ggml_set_name(lora_tensor, name.c_str()); - // load tensor data + // data offset size_t offset = fin.tell(); - size_t tensor_data_size = ggml_nbytes(lora_tensor); offset = (offset + 31) & -32; - fin.seek(offset, SEEK_SET); - fin.read_raw(lora_tensor->data, tensor_data_size); - lora_tensors[name] = lora_tensor; + // skip 
tensor data + fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); - // check if we have both A and B tensors and apply - if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && - lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); + } - ggml_tensor * dest_t = model_tensors[base_name]; + bool warned = false; + int n_tensors = 0; - offload_func_t offload_func = ggml_offload_nop; - offload_func_t offload_func_force_inplace = ggml_offload_nop; + // apply + ggml_backend_t backend_cpu = ggml_backend_cpu_init(); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); + return 1; + } + ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { - if (dest_t->type != GGML_TYPE_F16) { - throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type)); - } - offload_func = ggml_cuda_assign_buffers; - offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; - } -#endif // GGML_USE_CUBLAS + std::vector> read_buf; + for (const auto & it : model.tensors_by_name) { + const std::string & base_name = it.first; + ggml_tensor * model_t = it.second; - ggml_tensor * base_t; - if (ml) { - struct gguf_context * ctx_gguf = ml->ctx_gguf; + if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || + tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { + continue; + } - // load from base model - if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { - LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; - } + tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); + tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); - base_t = ml->get_tensor_meta(base_name.c_str()); - ml->load_data_for(base_t); - } else { - base_t = dest_t; - } + ggml_init_params lora_init_params = { + /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), + /* .mem_buffer */ nullptr, + /* .no_alloc */ true, + }; + ggml_context * lora_ctx = ggml_init(lora_init_params); + if (lora_ctx == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); + ggml_backend_free(backend_cpu); + return 1; + } - if (ggml_is_quantized(base_t->type)) { - if (!warned) { - LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; - } - } + // create tensors + ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); + ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); + ggml_set_name(loraA, metaA.name.c_str()); + ggml_set_name(loraB, metaB.name.c_str()); - ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; - GGML_ASSERT(loraA->type == GGML_TYPE_F32); - ggml_set_name(loraA, "loraA"); - - ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; - GGML_ASSERT(loraB->type == GGML_TYPE_F32); - ggml_set_name(loraB, "loraB"); - - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and 
%" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + ggml_tensor * base_t; + if (ml) { + if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) { + LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } + base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); + } else { + base_t = ggml_dup_tensor(lora_ctx, model_t); + } + ggml_set_name(base_t, base_name.c_str()); + // allocate in backend buffer + ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); + if (lora_buf == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); + return 1; + } + + // load tensor data + auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { + read_buf.resize(ggml_nbytes(tensor)); + fin.seek(tensor_meta.offset, SEEK_SET); + fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); + }; + load_tensor(metaA, loraA); + load_tensor(metaB, loraB); + + // load base model tensor data + if (ml) { + ml->load_data_for(base_t); + } else { + ggml_backend_tensor_copy(model_t, base_t); + } + + if (ggml_is_quantized(base_t->type) && !warned) { + LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + + if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { + LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + ggml_free(lora_ctx); + ggml_backend_buffer_free(lora_buf); + ggml_backend_free(backend_cpu); + return 1; + } + + auto build_lora_graph = [&]() { // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB); - offload_func(BA); + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); ggml_set_name(BA, "BA"); if (scaling != 1.0f) { - BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling); - offload_func(BA); + BA = ggml_scale(lora_ctx, BA, scaling); ggml_set_name(BA, "BA_scaled"); } ggml_tensor * r; - if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx.get(), dest_t, BA); - offload_func_force_inplace(r); - ggml_set_name(r, "r_add_inplace"); - } - else { - r = ggml_add(lora_ctx.get(), base_t, BA); - offload_func(r); - ggml_set_name(r, "r_add"); + r = ggml_add_inplace(lora_ctx, base_t, BA); + ggml_set_name(r, "r_add"); - r = ggml_cpy(lora_ctx.get(), r, dest_t); - offload_func(r); - ggml_set_name(r, "r_cpy"); + if (base_t->type != model_t->type) { + // convert the result to the model type + r = ggml_cast(lora_ctx, r, model_t->type); + ggml_set_name(r, "r_cast"); } - struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get()); - ggml_build_forward_expand(gf, r); + return r; + }; - ggml_graph_compute_helper(work_buffer, gf, n_threads); + ggml_cgraph * gf = ggml_new_graph(lora_ctx); + ggml_tensor * r = build_lora_graph(); + ggml_build_forward_expand(gf, r); - // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other - GGML_ASSERT(lora_tensors.size() == 2); + ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); + if (graph_buf == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to 
allocate graph tensors\n", __func__); + ggml_free(lora_ctx); + ggml_backend_buffer_free(lora_buf); + ggml_backend_free(backend_cpu); + return 1; + } - // we won't need these tensors again, reset the context to save memory - lora_ctx.reset(ggml_init(params)); - lora_tensors.clear(); + ggml_backend_graph_compute(backend_cpu, gf); - n_tensors++; - if (n_tensors % 4 == 0) { - LLAMA_LOG_INFO("."); - } + ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); + +#if 0 + // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU + //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); + + // sched compute + ggml_build_forward_expand(gf, build_graph()); + ggml_backend_sched_init_measure(sched, gf); + + // create the graph again, since the previous one was destroyed by the measure + ggml_graph_clear(gf); + ggml_build_forward_expand(gf, build_graph()); + ggml_backend_sched_graph_compute(sched, gf); + ggml_backend_sched_free(sched); +#endif + + ggml_backend_buffer_free(lora_buf); + ggml_backend_buffer_free(graph_buf); + ggml_free(lora_ctx); + + n_tensors++; + if (n_tensors % 4 == 0) { + LLAMA_LOG_INFO("."); } } + ggml_backend_free(backend_cpu); + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); @@ -9705,6 +9203,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, + /*.split_mode =*/ LLAMA_SPLIT_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -9716,7 +9215,8 @@ struct llama_model_params llama_model_default_params() { }; #ifdef GGML_USE_METAL - result.n_gpu_layers = 1; + // note: we usually have plenty of VRAM, so by default offload all layers to the GPU + result.n_gpu_layers = 999; #endif return result; @@ -9912,41 +9412,51 @@ struct llama_context * llama_new_context_with_model( GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); - // reserve memory for context buffers if (!hparams.vocab_only) { - // initialize backend + // initialize backends #ifdef GGML_USE_METAL if (model->n_gpu_layers > 0) { - ctx->backend = ggml_backend_metal_init(); - if (ctx->backend == nullptr) { + ctx->backend_metal = ggml_backend_metal_init(); + if (ctx->backend_metal == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__); } + ctx->backends.push_back(ctx->backend_metal); } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - // for testing only +#elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - ctx->backend = ggml_backend_cuda_init(0); - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__); + // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_LAYER requires a backend for each GPU + for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { + ggml_backend_t backend = 
ggml_backend_cuda_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } } } #endif - - if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) { - ctx->backend = ggml_backend_cpu_init(); - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); - } - } - - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__); + ctx->backend_cpu = ggml_backend_cpu_init(); + if (ctx->backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); delete ctx; return nullptr; } + ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, - cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, + cparams.n_ctx, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -9982,11 +9492,11 @@ struct llama_context * llama_new_context_with_model( } { - // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data + // buffer used to store the computation graph and the tensor meta data ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead()); - // create measure allocator - ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend); + ctx->sched = ggml_backend_sched_new(ctx->backends.data(), ctx->backends.size(), LLAMA_MAX_NODES); + ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); @@ -9994,50 +9504,19 @@ struct llama_context * llama_new_context_with_model( llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); - // measure memory requirements for the graph - size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf); + // initialize scheduler with the worst-case graph + ggml_backend_sched_init_measure(ctx->sched, gf); + // note: the number of splits during measure is higher than during inference due to the kv shift + int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); + LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); + ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0); - - // create allocator again with exact memory requirements - ggml_allocr_free(ctx->alloc); - - ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size); - ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (model->n_gpu_layers > 0) { - // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets - ggml_cuda_set_scratch_size(alloc_size + 64); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); - - // calculate total 
VRAM usage - auto add_tensor = [](const ggml_tensor * t, size_t & size) { - if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) { - size += ggml_nbytes(t); - } - }; - size_t model_vram_size = 0; - for (const auto & kv : model->tensors_by_name) { - add_tensor(kv.second, model_vram_size); - } - - size_t kv_vram_size = 0; - for (auto & k : ctx->kv_self.k_l) { - add_tensor(k, kv_vram_size); - } - for (auto & v : ctx->kv_self.v_l) { - add_tensor(v, kv_vram_size); - } - - size_t ctx_vram_size = alloc_size + kv_vram_size; - size_t total_vram_size = model_vram_size + ctx_vram_size; - - LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, - total_vram_size / 1024.0 / 1024.0, - model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); + for (ggml_backend_t backend : ctx->backends) { + ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend); + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_name(backend), + ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); } -#endif } // TODO(jared): remove this @@ -10157,9 +9636,8 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3 } int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { - return snprintf(buf, buf_size, "%s %s%s %s", + return snprintf(buf, buf_size, "%s %s %s", llama_model_arch_name(model->arch).c_str(), - model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); } @@ -10181,7 +9659,14 @@ uint64_t llama_model_n_params(const struct llama_model * model) { } struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { - return ggml_get_tensor(model->ctx, name); + auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), + [name](const std::pair & it) { + return it.first == name; + }); + if (it == model->tensors_by_name.end()) { + return nullptr; + } + return it->second; } uint32_t llama_model_quantize( @@ -10366,7 +9851,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf); + const size_t s_kv = ctx->kv_self.total_size(); const size_t s_total = ( + s_rng_size @@ -10495,7 +9980,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto n_embd_v_gqa = hparams.n_embd_v_gqa(); const auto n_ctx = cparams.n_ctx; - const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf); + const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = kv_self.head; const uint32_t kv_size = kv_self.size; const uint32_t kv_used = kv_self.used; @@ -10508,46 +9993,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); - ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - - std::vector kout2d(n_layer); - std::vector vout2d(n_layer); - - for (int il = 0; il < (int) n_layer; ++il) { - kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head); - vout2d[il] = 
ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa); - - ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], - n_embd_k_gqa, kv_head, - elt_size*n_embd_k_gqa, 0); - - ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], - kv_head, n_embd_v_gqa, - elt_size*n_ctx, 0); - - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il])); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il])); - } - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); - - ggml_backend_graph_compute(ctx->backend, gf); - std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - tmp_buf.resize(ggml_nbytes(kout2d[il])); - ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size()); + tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head); + ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); - tmp_buf.resize(ggml_nbytes(vout2d[il])); - ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); + // v is not contiguous, copy row by row + tmp_buf.resize(elt_size*kv_head); + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); + } } - - ggml_free(cpy_ctx); - - ggml_backend_buffer_free(buf); } for (uint32_t i = 0; i < kv_size; ++i) { @@ -10646,48 +10104,22 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); if (kv_buf_size) { - GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size); + GGML_ASSERT(kv_self.total_size() == kv_buf_size); const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); - ggml_cgraph * gf = ggml_new_graph(cpy_ctx); + for (int il = 0; il < (int) n_layer; ++il) { + size_t k_size = elt_size*n_embd_k_gqa*kv_head; + ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); + inp += k_size; - std::vector kin2d(n_layer); - std::vector vin2d(n_layer); - - for (int il = 0; il < n_layer; ++il) { - kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head); - vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa); - - ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], - n_embd_k_gqa, kv_head, - elt_size*n_embd_k_gqa, 0); - - ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], - kv_head, n_embd_v_gqa, - elt_size*n_ctx, 0); - - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d)); + // v is not contiguous, copy row by row + size_t v_row_size = elt_size*kv_head; + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { + ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size); + inp += v_row_size; + } } - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); - - // load data into the tensors - for (int il = 0; il < n_layer; ++il) { - ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il])); - inp += ggml_nbytes(kin2d[il]); - - ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il])); - inp += ggml_nbytes(vin2d[il]); - } - - ggml_backend_graph_compute(ctx->backend, gf); - - ggml_free(cpy_ctx); - - 
ggml_backend_buffer_free(buf); } ctx->kv_self.head = kv_head; diff --git a/llama.h b/llama.h index 9fdb94bc4..7abc9d099 100644 --- a/llama.h +++ b/llama.h @@ -116,6 +116,12 @@ extern "C" { LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, }; + enum llama_split_mode { + LLAMA_SPLIT_NONE = 0, // single GPU + LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_ROW = 2, // split rows across GPUs + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -178,8 +184,15 @@ extern "C" { struct llama_model_params { int32_t n_gpu_layers; // number of layers to store in VRAM - int32_t main_gpu; // the GPU that is used for scratch and small tensors - const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) + enum llama_split_mode split_mode; // how to split the model across multiple GPUs + + // main_gpu interpretation depends on split_mode: + // LLAMA_SPLIT_NONE: the GPU that is used for the entire model + // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results + // LLAMA_SPLIT_LAYER: ignored + int32_t main_gpu; + // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES + const float * tensor_split; // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. // If the provided progress_callback returns true, model loading continues. diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7a60d7743..d9b8b106a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -376,6 +376,11 @@ struct test_case { // allocate ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1); + if (buf == NULL) { + printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1)); + ggml_free(ctx); + return false; + } // build graph ggml_build_forward_expand(gf, out); @@ -463,19 +468,23 @@ struct test_case { GGML_UNUSED(index); }; - ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); + const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); - if (ud.ok) { - printf("\033[1;32mOK\033[0m\n"); - } else { - printf("\033[1;31mFAIL\033[0m\n"); + if (!cmp_ok) { + printf("compare failed "); } ggml_backend_buffer_free(buf); ggml_free(ctx); - return ud.ok; + if (ud.ok && cmp_ok) { + printf("\033[1;32mOK\033[0m\n"); + return true; + } + + printf("\033[1;31mFAIL\033[0m\n"); + return false; } bool eval_perf(ggml_backend_t backend, const char * op_name) { @@ -519,6 +528,11 @@ struct test_case { // allocate ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); + if (buf == NULL) { + printf("failed to allocate tensors\n"); + ggml_free(ctx); + return false; + } // randomize tensors initialize_tensors(ctx); From d5670d6e4639683d01556d71661b7b3d2e254583 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Tue, 9 Jan 2024 16:24:10 -0500 Subject: [PATCH 82/93] kompute : initial attempt at ggml-backend v2 support --- ggml-kompute.cpp | 181 +++++++++++++++++++++++++++++++++++++++++++++-- ggml-kompute.h | 16 +++++ llama.cpp | 148 ++++---------------------------------- 3 files changed, 203 insertions(+), 142 deletions(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index cc0adaf2f..01917ab01 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1,5 +1,7 @@ -#include "ggml-kompute.h" #include "ggml.h" +#include "ggml-backend.h" +#include "ggml-backend-impl.h" +#include "ggml-kompute.h" // These are generated 
at build time by cmake custom command #include "shaderop_scale.h" @@ -488,16 +490,28 @@ void ggml_vk_free_memory(ggml_vk_memory &memory) } static -decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) { +ggml_vk_memory * ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) { + // compatibility with ggml-backend + if (t->buffer && t->buffer->buft == ggml_backend_kompute_buffer_type()) { + ggml_vk_memory * buf_ctx = (ggml_vk_memory *) t->buffer->context; + + const intptr_t ioffs = reinterpret_cast(t->data) - reinterpret_cast(buf_ctx->data); + + GGML_ASSERT(ioffs >= 0 && ioffs + ggml_nbytes(t) <= (int64_t)t->buffer->size); + + offset = (uint64_t)ioffs; + return buf_ctx; + } + for (auto it = ctx->buffers.begin(); ; it++) { if (it == ctx->buffers.end()) { fprintf(stderr, "%s: Failed to find tensor %p\n", __func__, t->data); - return it; + return nullptr; } if (it->data <= t->data && reinterpret_cast(it->data) + it->size >= (reinterpret_cast(t->data) + ggml_nbytes(t))) { offset = reinterpret_cast(t->data) - reinterpret_cast(it->data); - return it; + return &*it; } } } @@ -505,8 +519,8 @@ decltype(ggml_kompute_context::buffers)::iterator ggml_vk_find_tensor(struct ggm static const std::shared_ptr ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint32_t *alignedOffset) { uint64_t originalOffset = 0; - auto res = ggml_vk_find_tensor(ctx, t, originalOffset); - if (res == ctx->buffers.end()) { + auto * res = ggml_vk_find_tensor(ctx, t, originalOffset); + if (!res) { static std::shared_ptr nullTensor = nullptr; return nullTensor; } @@ -1629,3 +1643,158 @@ kp::TensorT::dataType() { return TensorDataTypes::eUnsignedInt; } + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return "Kompute"; +} + +static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) { + auto * memory = (ggml_vk_memory *)buffer->context; + if (ggml_vk_has_device()) { + ggml_vk_free_memory(*memory); + } + delete memory; +} + +static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { + return ((ggml_vk_memory *)buffer->context)->data; +} + +static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + memcpy((char *)tensor->data + offset, data, size); + ggml_vk_h2d_buffer(*(ggml_vk_memory *)buffer->context); +} + +static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_vk_d2h_buffer(*(ggml_vk_memory *)buffer->context); + memcpy(data, (const char *)tensor->data + offset, size); +} + +static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + auto * memory = (ggml_vk_memory *)buffer->context; + memset(memory->data, value, buffer->size); + ggml_vk_h2d_buffer(*memory); +} + +static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = { + /* .get_name = */ ggml_backend_kompute_buffer_get_name, + /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer, + /* .get_base = */ ggml_backend_kompute_buffer_get_base, + /* .init_tensor = */ NULL, + /* .set_tensor = */ ggml_backend_kompute_buffer_set_tensor, + /* .get_tensor = */ 
ggml_backend_kompute_buffer_get_tensor, + /* .cpy_tensor_from = */ NULL, + /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_kompute_buffer_clear, + /* .reset = */ NULL, +}; + +// default buffer type + +static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return "Kompute"; +} + +static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size)); + return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size); +} + +static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} + +static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + GGML_UNUSED(buft); + return ggml_backend_is_kompute(backend); +} + +ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void) { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_kompute = { + /* .iface = */ { + /* .get_name = */ ggml_backend_kompute_buffer_type_get_name, + /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend, + /* .is_host = */ NULL, + }, + /* .context = */ NULL, + }; + + return &ggml_backend_buffer_type_kompute; +} + +// backend + +static const char * ggml_backend_kompute_name(ggml_backend_t backend) { + GGML_UNUSED(backend); + return "Kompute"; +} + +static void ggml_backend_kompute_free(ggml_backend_t backend) { + struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context; + ggml_vk_free_device(); + ggml_vk_free(ctx); + delete backend; +} + +static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) { + GGML_UNUSED(backend); + return ggml_backend_kompute_buffer_type(); +} + +static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + auto * ctx = (ggml_kompute_context *)backend->context; + ggml_vk_graph_compute(ctx, cgraph); + return true; +} + +static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + GGML_UNUSED(backend); + GGML_UNUSED(op); + return true; // TODO: implement +} + +static struct ggml_backend_i kompute_backend_i = { + /* .get_name = */ ggml_backend_kompute_name, + /* .free = */ ggml_backend_kompute_free, + /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_kompute_graph_compute, + /* .supports_op = */ ggml_backend_kompute_supports_op, +}; + +ggml_backend_t ggml_backend_kompute_init() { + if (!ggml_vk_has_device()) { + fprintf(stderr, "%s: error: device was not initialized\n", __func__); + return nullptr; + } + + struct ggml_kompute_context * ctx = ggml_vk_init(); + + ggml_backend_t kompute_backend = new ggml_backend { + /* .interface = */ kompute_backend_i, + /* .context = */ ctx, + }; + + return kompute_backend; +} + +bool 
ggml_backend_is_kompute(ggml_backend_t backend) { + return backend && backend->iface.get_name == ggml_backend_kompute_name; +} diff --git a/ggml-kompute.h b/ggml-kompute.h index ac8a4d4a0..f895dc545 100644 --- a/ggml-kompute.h +++ b/ggml-kompute.h @@ -1,5 +1,7 @@ #pragma once +#include "ggml-backend.h" + #include #include #include @@ -55,3 +57,17 @@ void ggml_vk_d2h_all(struct ggml_kompute_context * ctx); void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf); + +// +// backend API +// user-code should use only these functions +// + +// forward declaration +typedef struct ggml_backend * ggml_backend_t; + +GGML_API ggml_backend_t ggml_backend_kompute_init(void); + +GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend); + +GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void); diff --git a/llama.cpp b/llama.cpp index 3f2ae956f..0588250f2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -760,63 +760,6 @@ static std::string llama_format_win_err(DWORD err) { } #endif -// TODO(jared): remove this -struct llama_buffer { - void * data = NULL; - size_t size = 0; -#ifdef GGML_USE_KOMPUTE - ggml_vk_memory memory; -#endif - - // fallback to malloc / free - // useful in cases where CUDA can try to allocate PINNED memory - bool fallback = false; - - void resize(size_t n) { - llama_host_free(data); - -#ifdef GGML_USE_KOMPUTE - if (ggml_vk_has_device()) { - this->memory = ggml_vk_allocate(n); - this->data = (uint8_t*)memory.data; - this->size = n; - return; - } -#endif - data = llama_host_malloc(n); - if (!data) { - fallback = true; - data = malloc(n); - } else { - fallback = false; - } - - GGML_ASSERT(data); - size = n; - } - - ~llama_buffer() { - if (data) { -#ifdef GGML_USE_KOMPUTE - if (memory.data) { - if (ggml_vk_has_device()) { - ggml_vk_free_memory(memory); - } - data = NULL; - return; - } -#endif - if (fallback) { // NOLINT - free(data); - } else { - llama_host_free(data); - } - } - - data = NULL; - } -}; - template struct no_init { T value; @@ -1288,6 +1231,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { buft = ggml_backend_cuda_buffer_type(gpu); #elif defined(GGML_USE_CLBLAST) buft = ggml_backend_opencl_buffer_type(); +#elif defined(GGML_USE_KOMPUTE) + buft = ggml_backend_kompute_buffer_type(); #endif if (buft == nullptr) { @@ -1721,11 +1666,6 @@ struct llama_context { // allocator for the input tensors ggml_tallocr * alloc = nullptr; -// TODO(jared): remove this -#if defined(GGML_USE_KOMPUTE) - ggml_kompute_context * ctx_kompute = NULL; -#endif - // temporary buffer for copying data to/from the backend std::vector> buf_copy; @@ -4362,10 +4302,6 @@ struct llm_build_context { std::vector & buf_compute_meta; -#ifdef GGML_USE_KOMPUTE - ggml_kompute_context * ctx_kompute; -#endif - struct ggml_context * ctx0 = nullptr; // TODO: consider making the entire interface noexcept @@ -4405,10 +4341,6 @@ struct llm_build_context { do_rope_shift (worst_case || kv_self.has_shift), cb (cb), buf_compute_meta (lctx.buf_compute_meta) -// TODO(jared): remove this -#ifdef GGML_USE_KOMPUTE - , ctx_kompute (lctx.ctx_kompute) -#endif { // all initializations should be done in init() } @@ -6028,11 +5960,6 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; - // TODO(jared): do we still need this? 
-#ifdef GGML_USE_KOMPUTE - const bool needs_h2d_all = lctx.ctx_kompute && !ggml_vk_has_h2d_all(lctx.ctx_kompute); -#endif - // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) // TODO: improve handling of input and output tensors, then replace this with ggml_set_name llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { @@ -6149,22 +6076,6 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_K_shift = true; } - - // TODO(jared): this shouldn't be needed anymore -#ifdef GGML_USE_KOMPUTE - if (lctx.ctx_kompute && !needs_h2d_all) { - const char * offload_tensors[] = {"inp_tokens", "inp_pos", "KQ_mask", "K_shift"}; - for (auto off : offload_tensors) { - if (strcmp(name, off) == 0) { - ggml_vk_h2d_tensor(lctx.ctx_kompute, cur); - break; - } - } - if (strcmp(name, "inp_embd") == 0 && !batch.token) { - ggml_vk_h2d_tensor(lctx.ctx_kompute, cur); - } - } -#endif }; struct ggml_cgraph * result = NULL; @@ -6230,12 +6141,6 @@ static struct ggml_cgraph * llama_build_graph( GGML_ASSERT(false); } -#ifdef GGML_USE_KOMPUTE - if (needs_h2d_all) { - ggml_vk_h2d_all(lctx.ctx_kompute); - } -#endif - llm.free(); return result; @@ -6374,25 +6279,6 @@ static int llama_decode_internal( if (ggml_backend_is_metal(lctx.backend_metal)) { ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); } -#elif defined(GGML_USE_KOMPUTE) - if (lctx.ctx_kompute && n_tokens == 1) { - ggml_vk_graph_compute(lctx.ctx_kompute, gf); - ggml_vk_d2h_tensor(lctx.ctx_kompute, res); - } else { - if (lctx.ctx_kompute) { - for (int il = 0; il < hparams.n_layer; ++il) { - ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k_l[il]); - ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v_l[il]); - } - } - ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads); - if (lctx.ctx_kompute) { - for (int il = 0; il < hparams.n_layer; ++il) { - ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.k_l[il]); - ggml_vk_h2d_tensor(lctx.ctx_kompute, kv_self.v_l[il]); - } - } - } #endif if (lctx.backend_cpu != nullptr) { @@ -9446,6 +9332,16 @@ struct llama_context * llama_new_context_with_model( } } } +#elif defined(GGML_USE_KOMPUTE) + if (ggml_vk_has_device() && model->n_gpu_layers > 0) { + auto * backend = ggml_backend_kompute_init(); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } #endif ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { @@ -9518,23 +9414,6 @@ struct llama_context * llama_new_context_with_model( ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); } } - - // TODO(jared): remove this -#if defined(GGML_USE_KOMPUTE) - if (ggml_vk_has_device() && model->n_gpu_layers > 0) { - // this allocates all Vulkan resources and memory buffers - ctx->ctx_kompute = ggml_vk_init(); - - const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - - printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); - - ggml_vk_add_buffer(ctx->ctx_kompute, "data", ctx->model.buf.memory); - ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.memory); - ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->kv_self.buf.memory); - ggml_vk_add_buffer(ctx->ctx_kompute, "alloc", ctx->buf_alloc.memory); - } -#endif } #ifdef GGML_USE_MPI @@ -9555,9 +9434,6 @@ struct llama_context * llama_new_context_with_model( } void llama_free(struct llama_context * ctx) { -#ifdef GGML_USE_KOMPUTE - ggml_vk_free(ctx->ctx_kompute); 
-#endif delete ctx; #ifdef GGML_USE_KOMPUTE ggml_vk_free_device(); From 8a99f698954fc81d1b8c82350299d6a25bba5aa4 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 10 Jan 2024 13:44:34 -0500 Subject: [PATCH 83/93] fix assertion failure --- ggml-kompute.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 01917ab01..9b0ed3a22 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -491,9 +491,11 @@ void ggml_vk_free_memory(ggml_vk_memory &memory) static ggml_vk_memory * ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) { + ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; + // compatibility with ggml-backend - if (t->buffer && t->buffer->buft == ggml_backend_kompute_buffer_type()) { - ggml_vk_memory * buf_ctx = (ggml_vk_memory *) t->buffer->context; + if (buffer && buffer->buft == ggml_backend_kompute_buffer_type()) { + ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context; const intptr_t ioffs = reinterpret_cast(t->data) - reinterpret_cast(buf_ctx->data); From 50579f27e90e8dc9859e5da5cac4111ffca16b5a Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 10 Jan 2024 16:14:03 -0500 Subject: [PATCH 84/93] attempt to get test-backend-ops working --- ggml-backend.c | 6 +++ ggml-kompute.cpp | 99 +++++++++++++++++++++++++++++++++++++++++++++--- ggml-kompute.h | 8 ++++ 3 files changed, 108 insertions(+), 5 deletions(-) diff --git a/ggml-backend.c b/ggml-backend.c index 535426b9a..7a0a426e8 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -314,6 +314,12 @@ static void ggml_backend_registry_init(void) { extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL); #endif + +#ifdef GGML_USE_KOMPUTE + extern ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data); + extern ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void); + ggml_backend_register("Kompute", ggml_backend_reg_kompute_init, ggml_backend_kompute_buffer_type(), NULL); +#endif } void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) { diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 9b0ed3a22..734f973b6 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -499,7 +499,7 @@ ggml_vk_memory * ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct g const intptr_t ioffs = reinterpret_cast(t->data) - reinterpret_cast(buf_ctx->data); - GGML_ASSERT(ioffs >= 0 && ioffs + ggml_nbytes(t) <= (int64_t)t->buffer->size); + GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)t->buffer->size); offset = (uint64_t)ioffs; return buf_ctx; @@ -1344,6 +1344,82 @@ static void ggml_vk_cpy_f16_f32(Args&&... 
args) { ggml_vk_cpy<2, 4>(spirv, std::forward(args)...); } +static bool ggml_kompute_supports_op(const struct ggml_tensor * op) { + switch (op->type) { + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + break; + default: + return false; + } + + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + return true; + default: + ; + } + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_TRANSPOSE: + case GGML_OP_PERMUTE: + case GGML_OP_CONCAT: + case GGML_OP_ADD: + case GGML_OP_ACC: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_SUM_ROWS: + case GGML_OP_SOFT_MAX: + case GGML_OP_RMS_NORM: + case GGML_OP_GROUP_NORM: + case GGML_OP_NORM: + case GGML_OP_ALIBI: + case GGML_OP_ROPE: + case GGML_OP_IM2COL: + case GGML_OP_UPSCALE: + case GGML_OP_PAD: + case GGML_OP_ARGSORT: + case GGML_OP_LEAKY_RELU: + case GGML_OP_MUL_MAT: + case GGML_OP_MUL_MAT_ID: + return true; + case GGML_OP_DUP: + case GGML_OP_CPY: + case GGML_OP_CONT: + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + break; + default: + return false; + } + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + break; + default: + return false; + } + return true; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_GET_ROWS: + return op->ne[3] == 1; + default: + ; + } + return false; +} + void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { const int n_seq = 8; @@ -1362,7 +1438,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph auto& seq = *sequences[seq_idx]; const int node_start = (seq_idx + 0) * n_nodes_per_seq; - const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq; + const int node_end = std::min((seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes); for (int i = node_start; i < node_end; ++i) { struct ggml_tensor * src0 = gf->nodes[i]->src[0]; @@ -1381,6 +1457,11 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph break; } + if (!ggml_kompute_supports_op(dst)) { + fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); + GGML_ASSERT(!"unsupported op"); + } + const int32_t ne00 = src0 ? src0->ne[0] : 0; const int32_t ne01 = src0 ? src0->ne[1] : 0; const int32_t ne02 = src0 ? 
src0->ne[2] : 0; @@ -1718,7 +1799,7 @@ static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffe return ggml_backend_is_kompute(backend); } -ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void) { +ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_buffer_type_kompute = { /* .iface = */ { /* .get_name = */ ggml_backend_kompute_buffer_type_get_name, @@ -1761,8 +1842,7 @@ static bool ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct gg static bool ggml_backend_kompute_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { GGML_UNUSED(backend); - GGML_UNUSED(op); - return true; // TODO: implement + return ggml_kompute_supports_op(op); } static struct ggml_backend_i kompute_backend_i = { @@ -1800,3 +1880,12 @@ ggml_backend_t ggml_backend_kompute_init() { bool ggml_backend_is_kompute(ggml_backend_t backend) { return backend && backend->iface.get_name == ggml_backend_kompute_name; } + +extern "C" ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data); + +ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) { + GGML_UNUSED(params); + GGML_UNUSED(user_data); + ggml_vk_init_device(0, "gpu"); + return ggml_backend_kompute_init(); +} diff --git a/ggml-kompute.h b/ggml-kompute.h index f895dc545..e8d2d396b 100644 --- a/ggml-kompute.h +++ b/ggml-kompute.h @@ -63,6 +63,10 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph // user-code should use only these functions // +#ifdef __cplusplus +extern "C" { +#endif + // forward declaration typedef struct ggml_backend * ggml_backend_t; @@ -71,3 +75,7 @@ GGML_API ggml_backend_t ggml_backend_kompute_init(void); GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(void); + +#ifdef __cplusplus +} +#endif From de9b0bbbe4082eb8fa32760eac9962fc31aa0cbc Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 17 Jan 2024 10:09:27 -0500 Subject: [PATCH 85/93] add sanity check and fix kompute teardown order --- ggml-kompute.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 734f973b6..4d0dd2755 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -301,6 +301,7 @@ ggml_vk_device ggml_vk_current_device() { } ggml_kompute_context *ggml_vk_init() { + GGML_ASSERT(s_kompute_context == nullptr); s_kompute_context = new ggml_kompute_context; return s_kompute_context; } @@ -1824,8 +1825,8 @@ static const char * ggml_backend_kompute_name(ggml_backend_t backend) { static void ggml_backend_kompute_free(ggml_backend_t backend) { struct ggml_kompute_context * ctx = (struct ggml_kompute_context *)backend->context; - ggml_vk_free_device(); ggml_vk_free(ctx); + ggml_vk_free_device(); delete backend; } From 02b9bafe29dfa929b6c5d8e6b80bcd23cb1172f0 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 17 Jan 2024 13:47:03 -0500 Subject: [PATCH 86/93] kompute : ignore exceptions in ggml_vk_available_devices (#12) Signed-off-by: Jared Van Bortel --- ggml-kompute.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 4d0dd2755..10f94f18c 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -147,9 +147,15 @@ std::vector ggml_vk_available_devices(size_t memoryRequired) { if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance()) return 
results; - std::vector physicalDevices = komputeManager()->listDevices(); - uint32_t deviceCount = physicalDevices.size(); + std::vector physicalDevices; + try { + physicalDevices = komputeManager()->listDevices(); + } catch (vk::SystemError & err) { + std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n"; + return results; + } + uint32_t deviceCount = physicalDevices.size(); if (deviceCount == 0) return results; From 696faa866059cb6e227ac3543bb274adac88b8ab Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 18 Jan 2024 18:49:39 +0200 Subject: [PATCH 87/93] kompute : fix rope_f32 and scale ops (#5008) --- ggml-kompute.cpp | 3 ++- kompute-shaders/op_rope_f32.comp | 38 +++++++++++++++++++------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 10f94f18c..0f0003c48 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1540,7 +1540,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } break; case GGML_OP_SCALE: { - const float scale = *(const float *) src1->data; + float scale; memcpy(&scale, dst->op_params, sizeof(float)); + ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale); } break; case GGML_OP_UNARY: diff --git a/kompute-shaders/op_rope_f32.comp b/kompute-shaders/op_rope_f32.comp index 104ae0ba4..2adf5eb4e 100644 --- a/kompute-shaders/op_rope_f32.comp +++ b/kompute-shaders/op_rope_f32.comp @@ -35,31 +35,39 @@ void main() { const float x0 = inA[src]; const float x1 = inA[src+1]; - out_[dst_data] = x0*cos_theta - x1*sin_theta; + out_[dst_data] = x0*cos_theta - x1*sin_theta; out_[dst_data+1] = x0*sin_theta + x1*cos_theta; } } else { const float inv_ndims = -1.f/pcs.n_dims; - for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) { - for (uint ic = 0; ic < pcs.n_dims; ic += 2) { - const uint cur_rot = ib * pcs.n_dims + ic; + for (uint ic = 0; ic < pcs.n_dims; ic += 2) { + const uint cur_rot = ic; - float cos_theta, sin_theta; - rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); + float cos_theta, sin_theta; + rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - theta *= theta_scale; + theta *= theta_scale; - const uint i0 = ib*pcs.n_dims + ic/2; + const uint i0 = ic/2; - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - const float x0 = inA[src]; - const float x1 = inA[src+pcs.n_dims/2]; + const float x0 = inA[src]; + const float x1 = inA[src+pcs.n_dims/2]; - out_[dst_data] = x0*cos_theta - x1*sin_theta; - out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta; - } + out_[dst_data] = x0*cos_theta - x1*sin_theta; + out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta; + } + + for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) { + const uint i0 = ic; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ + + out_[dst_data + 0] = 
inA[src + 0]; + out_[dst_data + 1] = inA[src + 1]; } } } From a97935e098cab7a7bbb8d3820ac0f5eb3184fd8e Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 18 Jan 2024 11:48:12 -0500 Subject: [PATCH 88/93] clean up old backend code --- ggml-kompute.cpp | 128 +++++++++++++++++------------------------------ ggml-kompute.h | 12 ----- 2 files changed, 45 insertions(+), 95 deletions(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 0f0003c48..488f7899c 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -56,8 +56,6 @@ typedef ggml_fp16_t half; struct ggml_kompute_context { - bool hasH2DAll = false; - std::vector buffers; std::shared_ptr pool; }; @@ -312,10 +310,6 @@ ggml_kompute_context *ggml_vk_init() { return s_kompute_context; } -bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx) { - return ctx->hasH2DAll; -} - void ggml_vk_free(struct ggml_kompute_context * ctx) { assert(ctx == s_kompute_context); s_kompute_context = nullptr; @@ -414,9 +408,8 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v return vkDeviceMemory; } -size_t ggml_vk_aligned_offset(size_t offset) { - - static size_t minStorageBufferOffsetAlignment = 0; +static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) { + size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer); if (minStorageBufferOffsetAlignment == 0) { vk::PhysicalDeviceProperties deviceProperties; deviceProperties = komputeManager()->physicalDevice()->getProperties(); @@ -433,17 +426,7 @@ size_t ggml_vk_aligned_offset(size_t offset) { return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment; } -static void ggml_vk_h2d_buffer(const ggml_vk_memory &memory) { - if (memory.stagingBuffer) - komputeManager()->sequence()->eval(memory.primaryBuffer, memory.stagingBuffer, memory.size); -} - -static void ggml_vk_d2h_buffer(const ggml_vk_memory &memory) { - if (memory.stagingBuffer) - komputeManager()->sequence()->eval(memory.primaryBuffer, memory.stagingBuffer, memory.size); -} - -ggml_vk_memory ggml_vk_allocate(size_t size) { +static ggml_vk_memory ggml_vk_allocate(size_t size) { ggml_vk_memory memory; bool isHostVisible = false; { @@ -497,38 +480,26 @@ void ggml_vk_free_memory(ggml_vk_memory &memory) } static -ggml_vk_memory * ggml_vk_find_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint64_t & offset) { +ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) { ggml_backend_buffer_t buffer = t->view_src ? 
t->view_src->buffer : t->buffer; // compatibility with ggml-backend - if (buffer && buffer->buft == ggml_backend_kompute_buffer_type()) { - ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context; + GGML_ASSERT(buffer && buffer->buft == ggml_backend_kompute_buffer_type()); - const intptr_t ioffs = reinterpret_cast(t->data) - reinterpret_cast(buf_ctx->data); + ggml_vk_memory * buf_ctx = (ggml_vk_memory *) buffer->context; - GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)t->buffer->size); + const intptr_t ioffs = reinterpret_cast(t->data) - reinterpret_cast(buf_ctx->data); - offset = (uint64_t)ioffs; - return buf_ctx; - } + GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)t->buffer->size); - for (auto it = ctx->buffers.begin(); ; it++) { - if (it == ctx->buffers.end()) { - fprintf(stderr, "%s: Failed to find tensor %p\n", __func__, t->data); - return nullptr; - } - if (it->data <= t->data && - reinterpret_cast(it->data) + it->size >= (reinterpret_cast(t->data) + ggml_nbytes(t))) { - offset = reinterpret_cast(t->data) - reinterpret_cast(it->data); - return &*it; - } - } + offset = (uint64_t)ioffs; + return buf_ctx; } static -const std::shared_ptr ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t, uint32_t *alignedOffset) { +const std::shared_ptr ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) { uint64_t originalOffset = 0; - auto * res = ggml_vk_find_tensor(ctx, t, originalOffset); + auto * res = ggml_vk_find_tensor(t, originalOffset); if (!res) { static std::shared_ptr nullTensor = nullptr; return nullTensor; @@ -538,7 +509,7 @@ const std::shared_ptr ggml_vk_get_tensor(struct ggml_kompute_context const size_t nelements = ggml_nelements(t); size_t nbytes = ggml_nbytes(t); - size_t vulkanOffset = ggml_vk_aligned_offset(originalOffset); + size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset); if (alignedOffset) { *alignedOffset = originalOffset - vulkanOffset; nbytes += *alignedOffset; @@ -553,39 +524,6 @@ const std::shared_ptr ggml_vk_get_tensor(struct ggml_kompute_context vulkanOffset); } -void ggml_vk_add_buffer( - struct ggml_kompute_context * ctx, - const char * /*name*/, - const ggml_vk_memory &memory) { - ctx->buffers.emplace_back(memory); -} - -void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { - const auto res = ggml_vk_get_tensor(ctx, t, nullptr); - GGML_ASSERT(res); - komputeManager()->sequence()->eval({res}); -} - -void ggml_vk_h2d_all(struct ggml_kompute_context * ctx) { - for (auto& it : ctx->buffers) { - ggml_vk_h2d_buffer(it); - } - ctx->hasH2DAll = true; -} - -void ggml_vk_d2h_all(struct ggml_kompute_context * ctx) { - for (auto& it : ctx->buffers) { - ggml_vk_d2h_buffer(it); - } -} - -void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) { - const auto res = ggml_vk_get_tensor(ctx, t, nullptr); - - GGML_ASSERT(res); - komputeManager()->sequence()->eval({res}); -} - static std::vector getSpirvShader(const unsigned char* rawData, size_t size) { if (size % sizeof(uint32_t) != 0) { throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)"); @@ -1506,10 +1444,10 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph const static std::shared_ptr nullTensor = nullptr; uint32_t off_src0 = 0; uint32_t off_src1 = 0; - uint32_t off_dst = 0; - const std::shared_ptr& id_src0 = src0 ? 
ggml_vk_get_tensor(ctx, src0, &off_src0) : nullTensor; - const std::shared_ptr& id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1, &off_src1) : nullTensor; - const std::shared_ptr& id_dst = dst ? ggml_vk_get_tensor(ctx, dst, &off_dst) : nullTensor; + uint32_t off_dst = 0; + const std::shared_ptr& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor; + const std::shared_ptr& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor; + const std::shared_ptr& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor; switch (dst->op) { case GGML_OP_ADD: @@ -1757,19 +1695,33 @@ static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) } static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + + const auto res = ggml_vk_get_tensor(tensor); + GGML_ASSERT(res); + memcpy((char *)tensor->data + offset, data, size); - ggml_vk_h2d_buffer(*(ggml_vk_memory *)buffer->context); + + komputeManager()->sequence()->eval({res}); } static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - ggml_vk_d2h_buffer(*(ggml_vk_memory *)buffer->context); + GGML_UNUSED(buffer); + + const auto res = ggml_vk_get_tensor(tensor); + GGML_ASSERT(res); + + komputeManager()->sequence()->eval({res}); + memcpy(data, (const char *)tensor->data + offset, size); } static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { auto * memory = (ggml_vk_memory *)buffer->context; memset(memory->data, value, buffer->size); - ggml_vk_h2d_buffer(*memory); + + if (memory->stagingBuffer) + komputeManager()->sequence()->eval(memory->primaryBuffer, memory->stagingBuffer, memory->size); } static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = { @@ -1799,7 +1751,17 @@ static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_ static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - return 32; + + static size_t minStorageBufferOffsetAlignment = 0; + if (minStorageBufferOffsetAlignment == 0) { + GGML_ASSERT(ggml_vk_has_device()); + vk::PhysicalDeviceProperties deviceProperties; + deviceProperties = komputeManager()->physicalDevice()->getProperties(); + vk::PhysicalDeviceLimits deviceLimits = deviceProperties.limits; + minStorageBufferOffsetAlignment = deviceLimits.minStorageBufferOffsetAlignment; + } + + return minStorageBufferOffsetAlignment; } static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { diff --git a/ggml-kompute.h b/ggml-kompute.h index e8d2d396b..288c835c5 100644 --- a/ggml-kompute.h +++ b/ggml-kompute.h @@ -41,21 +41,9 @@ bool ggml_vk_has_device(); bool ggml_vk_using_vulkan(); ggml_vk_device ggml_vk_current_device(); struct ggml_kompute_context * ggml_vk_init(void); -bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx); void ggml_vk_free(struct ggml_kompute_context * ctx); -size_t ggml_vk_aligned_offset(size_t offset); -ggml_vk_memory ggml_vk_allocate(size_t size); void ggml_vk_free_memory(ggml_vk_memory &memory); -void ggml_vk_add_buffer( - struct ggml_kompute_context * ctx, - const char * name, - const ggml_vk_memory &memory); - -void ggml_vk_h2d_all(struct ggml_kompute_context * ctx); -void ggml_vk_d2h_all(struct ggml_kompute_context * ctx); -void ggml_vk_h2d_tensor(struct 
ggml_kompute_context * ctx, struct ggml_tensor * t); -void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t); void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf); // From 0f1a958a518e716e6ffd3b481e11e0c59b83f0a7 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 18 Jan 2024 11:48:27 -0500 Subject: [PATCH 89/93] actually fix this assertion --- ggml-kompute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 488f7899c..146352c38 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -490,7 +490,7 @@ ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & of const intptr_t ioffs = reinterpret_cast(t->data) - reinterpret_cast(buf_ctx->data); - GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)t->buffer->size); + GGML_ASSERT(ioffs >= 0 && ioffs + (int64_t)ggml_nbytes(t) <= (int64_t)buffer->size); offset = (uint64_t)ioffs; return buf_ctx; From 16bc3c3be8f488fd0f1bd02bfb3a4a8faa655948 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 18 Jan 2024 11:56:00 -0500 Subject: [PATCH 90/93] sync op_rope_f16 with recent op_rope_f32 changes --- kompute-shaders/op_rope_f16.comp | 36 +++++++++++++++++++------------- kompute-shaders/op_rope_f32.comp | 8 +++---- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/kompute-shaders/op_rope_f16.comp b/kompute-shaders/op_rope_f16.comp index 3abe3ed33..b44622584 100644 --- a/kompute-shaders/op_rope_f16.comp +++ b/kompute-shaders/op_rope_f16.comp @@ -40,26 +40,34 @@ void main() { } } else { const float inv_ndims = -1.f/pcs.n_dims; - for (uint ib = 0; ib < pcs.ne0/pcs.n_dims; ++ib) { - for (uint ic = 0; ic < pcs.n_dims; ic += 2) { - const uint cur_rot = ib * pcs.n_dims + ic; + for (uint ic = 0; ic < pcs.n_dims; ic += 2) { + const uint cur_rot = ic; - float cos_theta, sin_theta; - rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); + float cos_theta, sin_theta; + rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - theta *= theta_scale; + theta *= theta_scale; - const uint i0 = ib*pcs.n_dims + ic/2; + const uint i0 = ic/2; - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - const float x0 = float(inA[src]); - const float x1 = float(inA[src+pcs.n_dims/2]); + const float x0 = float(inA[src]); + const float x1 = float(inA[src+pcs.n_dims/2]); - out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); - out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta); - } + out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); + out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta); + } + + for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) { + const uint i0 = ic; + + const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in + const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ + + out_[dst_data + 0] = inA[src + 0]; + 
out_[dst_data + 1] = inA[src + 1]; } } } diff --git a/kompute-shaders/op_rope_f32.comp b/kompute-shaders/op_rope_f32.comp index 2adf5eb4e..2c0235d75 100644 --- a/kompute-shaders/op_rope_f32.comp +++ b/kompute-shaders/op_rope_f32.comp @@ -29,8 +29,8 @@ void main() { theta *= theta_scale; -        const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in -        const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ +        const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in +        const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ const float x0 = inA[src]; const float x1 = inA[src+1]; @@ -50,8 +50,8 @@ void main() { const uint i0 = ic/2; -            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in -            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ +            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in +            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ const float x0 = inA[src]; const float x1 = inA[src+pcs.n_dims/2]; From 7addf2b878ba3b070bd55b57cc2af1fb8973c442 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 18 Jan 2024 16:11:00 -0500 Subject: [PATCH 91/93] never try to evaluate an empty command buffer This fixes the immediate crashes with test-backend-ops - when evaluating individual no-ops like OP_VIEW, it tries to submit an empty command buffer, which crashes RADV and hangs AMDVLK. --- ggml-kompute.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 146352c38..007367611 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1385,6 +1385,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph const int node_start = (seq_idx + 0) * n_nodes_per_seq; const int node_end = std::min((seq_idx == n_seq - 1) ?
gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes); + bool any_commands_recorded = false; + for (int i = node_start; i < node_end; ++i) { struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; @@ -1402,6 +1404,8 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph break; } + any_commands_recorded = true; + if (!ggml_kompute_supports_op(dst)) { fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); GGML_ASSERT(!"unsupported op"); @@ -1647,7 +1651,9 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph } // Evaluate sequence - seq.evalAsync(); + if (any_commands_recorded) { + seq.evalAsync(); + } } // Wait for all sequences to finish From 610394fff83368d5465b62f8c8add3737a39e42a Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Thu, 18 Jan 2024 15:32:55 -0500 Subject: [PATCH 92/93] fix supported ops for kompute backend --- ggml-kompute.cpp | 41 ++++++++++++++++++++++++-------------- tests/test-backend-ops.cpp | 5 ++++- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 007367611..720a66986 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1316,27 +1316,13 @@ static bool ggml_kompute_supports_op(const struct ggml_tensor * op) { case GGML_OP_VIEW: case GGML_OP_TRANSPOSE: case GGML_OP_PERMUTE: - case GGML_OP_CONCAT: case GGML_OP_ADD: - case GGML_OP_ACC: case GGML_OP_MUL: - case GGML_OP_DIV: case GGML_OP_SCALE: - case GGML_OP_SQR: - case GGML_OP_SUM_ROWS: case GGML_OP_SOFT_MAX: case GGML_OP_RMS_NORM: - case GGML_OP_GROUP_NORM: case GGML_OP_NORM: - case GGML_OP_ALIBI: case GGML_OP_ROPE: - case GGML_OP_IM2COL: - case GGML_OP_UPSCALE: - case GGML_OP_PAD: - case GGML_OP_ARGSORT: - case GGML_OP_LEAKY_RELU: - case GGML_OP_MUL_MAT: - case GGML_OP_MUL_MAT_ID: return true; case GGML_OP_DUP: case GGML_OP_CPY: @@ -1357,8 +1343,33 @@ static bool ggml_kompute_supports_op(const struct ggml_tensor * op) { } return true; case GGML_OP_DIAG_MASK_INF: - case GGML_OP_GET_ROWS: return op->ne[3] == 1; + case GGML_OP_GET_ROWS: + switch (op->src[0]->type) { + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q6_K: + return op->ne[3] == 1; + default: + ; + } + return false; + case GGML_OP_MUL_MAT: + if (op->src[1]->type != GGML_TYPE_F32 || ggml_is_transposed(op->src[0]) || ggml_is_transposed(op->src[1])) + return false; + + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q6_K: + return true; + default: + ; + } default: ; } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index d9b8b106a..a0063bbb9 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -360,7 +360,10 @@ struct test_case { // check if backends support op bool supported = true; for (ggml_backend_t backend : {backend1, backend2}) { - if (!ggml_backend_supports_op(backend, out)) { + if ( + !ggml_backend_supports_op(backend, out) + || (op_desc(out) == "MOE" && !strcmp(ggml_backend_name(backend), "Kompute")) + ) { printf("not supported [%s] ", ggml_backend_name(backend)); supported = false; } From 145321516506b9cfda00b62b669ac4fa639ecc5c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 19 Jan 2024 00:09:16 +0200 Subject: [PATCH 93/93] kompute : fix ggml_add kernel --- ggml-kompute.cpp | 2 +- kompute-shaders/op_add.comp | 16 ++++++++-------- 2 files changed, 9 
insertions(+), 9 deletions(-) diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 720a66986..520cd1fd7 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1467,7 +1467,7 @@ void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph switch (dst->op) { case GGML_OP_ADD: { - if (ggml_nelements(src1) == ne10 && ne00 % 4 == 0) { + if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { // src1 is a row ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00); } else { diff --git a/kompute-shaders/op_add.comp b/kompute-shaders/op_add.comp index c86673452..b7b76a79d 100644 --- a/kompute-shaders/op_add.comp +++ b/kompute-shaders/op_add.comp @@ -30,6 +30,7 @@ layout(push_constant) uniform PushConstants { int nb1; int nb2; int nb3; + //int offs; // TODO: needed for GGML_OP_ACC, see metal code } pcs; // general-purpose kernel for addition of two tensors @@ -44,15 +45,14 @@ void main() { const uint i12 = i02 % pcs.ne12; const uint i11 = i01 % pcs.ne11; - uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + gl_SubgroupInvocationID.x*pcs.nb00) / 4); - uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 + gl_SubgroupInvocationID.x*pcs.nb10) / 4); - uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + gl_SubgroupInvocationID.x*pcs.nb0 ) / 4); + int offs = 0; // TMP (see above) + + uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4); + uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4); + uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4); for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) { - out_[pcs.outOff + dst_off] = inA[pcs.inAOff + src0_off] + inB[pcs.inBOff + src1_off]; - - src0_off += gl_WorkGroupSize.x*pcs.ne00; - src1_off += gl_WorkGroupSize.x*pcs.ne10; - dst_off += gl_WorkGroupSize.x*pcs.ne0; + const uint i10 = i0 % pcs.ne10; + out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10]; } }
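
For reference, here is a minimal sketch of how the op-support gating added in this series (ggml_kompute_supports_op via the backend's supports_op callback) can be exercised from user code: build a small graph and ask the backend whether each node is supported before dispatching it, mirroring the check that ggml_vk_graph_compute now performs internally. This is illustrative only and not part of the patches; the file name, tensor shapes, and context size are arbitrary, and it assumes the headers and symbols introduced above (ggml-kompute.h, ggml_vk_init_device, ggml_backend_kompute_init).

// check_ops.cpp -- illustrative sketch, not part of this patch series
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-kompute.h"
#include <cstdio>

int main() {
    // bring up the first Vulkan GPU and the Kompute backend,
    // as ggml_backend_reg_kompute_init() does in this series
    ggml_vk_init_device(0, "gpu");
    ggml_backend_t backend = ggml_backend_kompute_init();
    if (!backend) {
        fprintf(stderr, "Kompute backend unavailable\n");
        return 1;
    }

    // no_alloc: we only inspect the ops here, we do not run them
    struct ggml_init_params params = {
        /* .mem_size   = */ 16u * 1024 * 1024,
        /* .mem_buffer = */ nullptr,
        /* .no_alloc   = */ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    // tiny graph: soft_max(mul_mat(a, b))
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * c = ggml_soft_max(ctx, ggml_mul_mat(ctx, a, b));

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // query support per node instead of hitting the GGML_ASSERT
    // inside ggml_vk_graph_compute() at dispatch time
    for (int i = 0; i < gf->n_nodes; ++i) {
        struct ggml_tensor * node = gf->nodes[i];
        printf("%-16s %s\n", ggml_op_desc(node),
               ggml_backend_supports_op(backend, node) ? "supported" : "not supported");
    }

    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}

Keeping the decision in the backend's supports_op callback is what lets test-backend-ops skip unsupported cases up front, while the new assertion in ggml_vk_graph_compute catches anything that slips through to dispatch.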