diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..4a068a698
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "kompute"]
+ path = kompute
+ url = https://github.com/KomputeProject/kompute.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cc7560a7a..cae411109 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,6 +73,7 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -309,6 +310,24 @@ if (LLAMA_CLBLAST)
endif()
endif()
+if (LLAMA_KOMPUTE)
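+    # Kompute is vendored as a git submodule; run `git submodule update --init`
+    # to fetch it before enabling this option.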
+ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
+ message(STATUS "Kompute found")
+
+ add_subdirectory(kompute)
+
+ set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ggml-vulkan.h)
+
+ add_compile_definitions(GGML_USE_KOMPUTE)
+
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
+ else()
+ message(WARNING "Kompute not found")
+ endif()
+endif()
+
if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags
@@ -466,6 +485,7 @@ add_library(ggml OBJECT
ggml.h
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
+ ${GGML_SOURCES_KOMPUTE}
${GGML_SOURCES_METAL}
${GGML_SOURCES_EXTRA}
)
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
new file mode 100644
index 000000000..45f426a2f
--- /dev/null
+++ b/ggml-vulkan.cpp
@@ -0,0 +1,165 @@
+#include "ggml-vulkan.h"
+#include "ggml.h"
+
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <kompute/Kompute.hpp>
+
+typedef ggml_fp16_t half;
+
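+// Helper for embedding GLSL source in this file as a C string literal.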
+#define MULTILINE_QUOTE(...) #__VA_ARGS__
+#define STRINGIFY(x) STRINGIFY2(x)
+#define STRINGIFY2(x) #x
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
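+// A q4_0 block stores 32 weights as 16 nibble pairs plus one f16 scale d;
+// each weight dequantizes as y = (nibble - 8) * d.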
+typedef struct {
+ half d;
+ uint8_t qs[QK4_0 / 2];
+} block_q4_0;
+
+typedef struct {
+ half d;
+ half m;
+ uint8_t qs[QK4_1 / 2];
+} block_q4_1;
+
+
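+// Global Kompute manager; owns the Vulkan instance, device and queue.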
+kp::Manager mgr;
+
+
+
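+// GLSL prelude shared by the kernels: 16-bit float and 8-bit int arithmetic
+// types are needed to read the quantized blocks directly on the GPU.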
+static const std::string program_source_head = R"(
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: enable
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+layout (local_size_x = 1) in;
+)";
+
+
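+// Dequantize q4_0: invocation (i, j) unpacks byte j of block i into two
+// values, written qk/2 apart within the output block.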
+static const std::string kernel_dequantize_row_q4_0 =
+ program_source_head+'\n'+MULTILINE_QUOTE(
+// Tensors
+layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; };
+layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; };
+layout(binding = 2) buffer tensorY { float y[]; };
+
+// Push constants
+layout(push_constant) uniform PushConstants {
+ int k;
+} pcs;
+
+void main() {
+ const int qk = QK4_0;
+
+ const int i = int(gl_GlobalInvocationID.x);
+ const int j = int(gl_GlobalInvocationID.y);
+
+ const float16_t d = x_d[i];
+ const uint8_t qs = x_qs[i * (QK4_0 / 2) + j];
+
+ const int x0 = (qs & 0x0F) - 8;
+ const int x1 = (qs >> 4) - 8;
+
+ y[i*qk + j + 0 ] = float16_t(x0)*d;
+ y[i*qk + j + qk/2] = float16_t(x1)*d;
+}
+);
+
+
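+// Compile GLSL to SPIR-V by shelling out to glslangValidator through a
+// temporary file; glslangValidator must be installed and on PATH.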
+std::vector<uint32_t> compileSource(const std::string& source) {
+ //FIXME: Terrible solution!!!!
+ std::ofstream fileOut("tmp_kp_shader.comp");
+ fileOut << source;
+ fileOut.close();
+ if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str()))
+ throw std::runtime_error("Error running glslangValidator command");
+ std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary);
+    std::vector<char> buffer;
+    buffer.insert(buffer.begin(), std::istreambuf_iterator<char>(fileStream), {});
+    return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())};
+}
+
+void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) {
+    const int qk = QK4_0;
+    const unsigned nb = k / qk;     // number of q4_0 blocks in x
+    const unsigned y_size = nb*qk;  // one dequantized float per packed value
+    // compile once and reuse the cached SPIR-V on later calls
+    static const auto spirv = compileSource(kernel_dequantize_row_q4_0);
+
+    const auto x = reinterpret_cast<const block_q4_0 *>(x_);
+
+    auto getVecBlockQ4_0D = [&] (const block_q4_0 *x) {
+        std::vector<half> fres;
+        fres.reserve(nb);
+        for (unsigned it = 0; it != nb; it++) {
+            fres.push_back(x[it].d);
+        }
+        return fres;
+    };
+    auto getVecBlockQ4_0QS = [&] (const block_q4_0 *x) {
+        std::vector<uint8_t> fres;
+        fres.reserve(nb*(qk/2)); // reserve, not resize: values are appended below
+        for (unsigned x_it = 0; x_it != nb; x_it++) {
+            for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) {
+                fres.push_back(x[x_it].qs[qs_it]);
+            }
+        }
+        return fres;
+    };
+
+    const auto tensorBlockQ4_0D = mgr.tensorT<half>(getVecBlockQ4_0D(x));
+    const auto tensorBlockQ4_0QS = mgr.tensorT<uint8_t>(getVecBlockQ4_0QS(x));
+    const auto tensorY = mgr.tensor(std::vector<float>(y, y+y_size));
+
+ struct PushConsts {
+ int k;
+ } pushConsts {
+ k
+ };
+
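+    // Upload the inputs, dispatch one workgroup per (block, byte) pair, then
+    // sync the dequantized row back to host memory.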
+    mgr.sequence()
+        ->record<kp::OpTensorSyncDevice>({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY})
+        ->record<kp::OpAlgoDispatch>(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}, spirv, {nb, qk/2, 1}, {}, {0}), std::vector<PushConsts>{pushConsts})
+        ->record<kp::OpTensorSyncLocal>({tensorY})
+        ->eval();
+
+    std::memcpy(y, tensorY->data(), y_size*sizeof(float));
+}
+
+
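+// Map the element types used above onto Kompute's tensor data types.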
+template<>
+kp::Tensor::TensorDataTypes
+kp::TensorT<half>::dataType()
+{
+ return TensorDataTypes::eFloat;
+}
+
+template<>
+kp::Tensor::TensorDataTypes
+kp::TensorT<uint8_t>::dataType()
+{
+ return TensorDataTypes::eUnsignedInt;
+}
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
new file mode 100644
index 000000000..34e6d46b3
--- /dev/null
+++ b/ggml-vulkan.h
@@ -0,0 +1,14 @@
+#pragma once
+
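+// Minimal C interface so ggml.c (plain C) can call into the Vulkan backend.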
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_vk_init(void);
+
+void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml.c b/ggml.c
index 4319683f5..151b9eefb 100644
--- a/ggml.c
+++ b/ggml.c
@@ -161,6 +161,8 @@ inline static void* ggml_aligned_malloc(size_t size) {
#endif
#elif defined(GGML_USE_OPENBLAS)
#include <cblas.h>
+#elif defined(GGML_USE_KOMPUTE)
+#include "ggml-vulkan.h"
#elif defined(GGML_USE_CUBLAS)
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
@@ -1548,7 +1550,11 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
-        .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0,
+#if defined(GGML_USE_KOMPUTE)
+        .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_0,
+#else
+        .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0,
+#endif
.quantize_row_q = quantize_row_q4_0,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
.quantize_row_q_dot = quantize_row_q8_0,
diff --git a/kompute b/kompute
new file mode 160000
index 000000000..63567a72b
--- /dev/null
+++ b/kompute
@@ -0,0 +1 @@
+Subproject commit 63567a72be6b26f79da92becaffa7cd55f46642b