diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..4a068a698
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "kompute"]
+	path = kompute
+	url = https://github.com/KomputeProject/kompute.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cc7560a7a..cae411109 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,6 +73,7 @@ set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
+option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@@ -309,6 +310,22 @@ if (LLAMA_CLBLAST)
     endif()
 endif()
 
+if (LLAMA_KOMPUTE)
+    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
+        message(STATUS "Kompute found")
+
+        add_subdirectory(kompute)
+
+        set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ggml-vulkan.h)
+
+        add_compile_definitions(GGML_USE_KOMPUTE)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
+    else()
+        message(WARNING "Kompute not found")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
     if (NOT MSVC)
         set(c_flags
@@ -466,6 +483,7 @@ add_library(ggml OBJECT
             ggml.h
             ${GGML_SOURCES_CUDA}
             ${GGML_SOURCES_OPENCL}
+            ${GGML_SOURCES_KOMPUTE}
             ${GGML_SOURCES_METAL}
             ${GGML_SOURCES_EXTRA}
             )
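For context on how this flag is meant to be consumed: enabling `LLAMA_KOMPUTE` compiles `ggml-vulkan.cpp` into ggml and defines `GGML_USE_KOMPUTE`, which `ggml.c` (further down in this patch) uses to route q4_0 dequantization to the GPU. A minimal sketch of that compile-time pattern follows; the dispatcher function is hypothetical, only the `GGML_USE_KOMPUTE` macro and `ggml_vk_dequantize_row_q4_0` come from this patch:

```cpp
// Illustrative only: how a GGML_USE_KOMPUTE compile-time switch is consumed.
// The actual hookup in this patch is the quantize_fns table change in ggml.c.
#include <cstdio>

#ifdef GGML_USE_KOMPUTE
#include "ggml-vulkan.h"
#endif

static void dequantize_row_q4_0_any(const void * x, float * y, int k) {
#ifdef GGML_USE_KOMPUTE
    ggml_vk_dequantize_row_q4_0(x, y, k);  // GPU path added by this patch
#else
    (void)x; (void)y; (void)k;             // CPU path lives in ggml.c
    std::fprintf(stderr, "built without LLAMA_KOMPUTE\n");
#endif
}
```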
diff --git a/CMakeLists.txt.user b/CMakeLists.txt.user
new file mode 100644
index 000000000..e7d373827
--- /dev/null
+++ b/CMakeLists.txt.user
@@ -0,0 +1,454 @@
+[454 lines of Qt Creator project settings (IDE-local XML: editor preferences, local build directories, run configurations for the example binaries) accidentally committed with this patch; contents omitted]
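The new backend file below GPU-accelerates q4_0 dequantization. As a reading aid, here is the same math in plain C++: each q4_0 block packs 32 weights as one fp16 scale `d` plus 16 bytes of 4-bit quants, and each weight is recovered as `(nibble - 8) * d`. This sketch mirrors the GLSL kernel below and is not part of the patch; `half_to_float` is a simplified stand-in for a real fp16 conversion:

```cpp
#include <cstdint>
#include <cstring>

constexpr int QK4_0 = 32;

struct BlockQ4_0 {
    uint16_t d;               // fp16 scale, stored as raw bits
    uint8_t  qs[QK4_0 / 2];   // 4-bit quants, two per byte
};

// Simplified fp16 -> fp32 conversion (normal numbers only; enough here).
static float half_to_float(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
    const uint32_t exp  = (h >> 10) & 0x1Fu;
    const uint32_t mant =  h & 0x3FFu;
    const uint32_t bits = sign | ((exp + 112u) << 23) | (mant << 13);
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// CPU mirror of the GLSL kernel below: one block yields QK4_0 floats.
static void dequantize_row_q4_0_ref(const BlockQ4_0 * x, float * y, int k) {
    const int nb = k / QK4_0;
    for (int i = 0; i < nb; ++i) {
        const float d = half_to_float(x[i].d);
        for (int j = 0; j < QK4_0 / 2; ++j) {
            const int x0 = (x[i].qs[j] & 0x0F) - 8;  // low nibble
            const int x1 = (x[i].qs[j] >> 4)  - 8;   // high nibble
            y[i*QK4_0 + j]           = x0 * d;
            y[i*QK4_0 + j + QK4_0/2] = x1 * d;
        }
    }
}
```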
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
new file mode 100644
index 000000000..45f426a2f
--- /dev/null
+++ b/ggml-vulkan.cpp
@@ -0,0 +1,151 @@
+#include "ggml-vulkan.h"
+#include "ggml.h"
+
+#include <string>
+#include <vector>
+#include <fstream>
+#include <iterator>
+#include <stdexcept>
+#include <cstring>
+#include <cstdlib>
+#include <kompute/Kompute.hpp>
+
+typedef ggml_fp16_t half;
+
+#define MULTILINE_QUOTE(...) #__VA_ARGS__
+#define STRINGIFY(x) STRINGIFY2(x)
+#define STRINGIFY2(x) #x
+
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+
+typedef struct {
+    half d;                  // delta (fp16 scale), stored as raw bits
+    uint8_t qs[QK4_0 / 2];   // 4-bit quants, two per byte
+} block_q4_0;
+
+typedef struct {
+    half d;                  // delta
+    half m;                  // min
+    uint8_t qs[QK4_1 / 2];   // 4-bit quants, two per byte
+} block_q4_1;
+
+
+kp::Manager mgr;
+
+
+static const std::string program_source_head = R"(
+#version 450
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: enable
+#extension GL_EXT_shader_explicit_arithmetic_types_int8: enable
+#define QK4_0 32
+#define QR4_0 2
+#define QK4_1 32
+layout (local_size_x = 1) in;
+)";
+
+
+static const std::string kernel_dequantize_row_q4_0 =
+    program_source_head+'\n'+MULTILINE_QUOTE(
+// Tensors
+layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; };
+layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; };
+layout(binding = 2) buffer tensorY { float y[]; };
+
+// Push constants
+layout(push_constant) uniform PushConstants {
+    int k;
+} pcs;
+
+// One invocation per (block, byte-in-block): x = block index, y = quant index
+void main() {
+    const int qk = QK4_0;
+
+    const int i = int(gl_GlobalInvocationID.x);
+    const int j = int(gl_GlobalInvocationID.y);
+
+    const float16_t d = x_d[i];
+    const uint8_t qs = x_qs[i * (QK4_0 / 2) + j];
+
+    const int x0 = (qs & 0x0F) - 8;
+    const int x1 = (qs >> 4) - 8;
+
+    y[i*qk + j + 0   ] = float16_t(x0)*d;
+    y[i*qk + j + qk/2] = float16_t(x1)*d;
+}
+);
+
+
+// Compiles GLSL to SPIR-V by shelling out to glslangValidator.
+std::vector<uint32_t> compileSource(const std::string& source) {
+    //FIXME: Terrible solution!!!!
+    std::ofstream fileOut("tmp_kp_shader.comp");
+    fileOut << source;
+    fileOut.close();
+    if (system(std::string("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv").c_str()))
+        throw std::runtime_error("Error running glslangValidator command");
+    std::ifstream fileStream("tmp_kp_shader.comp.spv", std::ios::binary);
+    std::vector<char> buffer;
+    buffer.insert(buffer.begin(), std::istreambuf_iterator<char>(fileStream), {});
+    return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())};
+}
+
+void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) {
+    static const int qk = QK4_0;
+    const unsigned nb = k / qk;       // number of blocks in this row
+    const unsigned y_size = nb * qk;  // a row dequantizes to k floats
+    static const auto spirv = compileSource(kernel_dequantize_row_q4_0);
+
+    const auto x = reinterpret_cast<const block_q4_0 *>(x_);
+
+    // Split the array-of-blocks into two flat arrays (scales, quants) for the GPU
+    auto getVecBlockQ4_0D = [nb] (const block_q4_0 *x) {
+        std::vector<half> fres;
+        fres.reserve(nb);
+        for (unsigned it = 0; it != nb; it++) {
+            fres.push_back(x[it].d);
+        }
+        return fres;
+    };
+    auto getVecBlockQ4_0QS = [nb] (const block_q4_0 *x) {
+        std::vector<uint8_t> fres;
+        fres.reserve(nb*(qk/2));
+        for (unsigned x_it = 0; x_it != nb; x_it++) {
+            for (unsigned qs_it = 0; qs_it != qk / 2; qs_it++) {
+                fres.push_back(x[x_it].qs[qs_it]);
+            }
+        }
+        return fres;
+    };
+
+    const auto tensorBlockQ4_0D = mgr.tensorT<half>(getVecBlockQ4_0D(x));
+    const auto tensorBlockQ4_0QS = mgr.tensorT<uint8_t>(getVecBlockQ4_0QS(x));
+    const auto tensorY = mgr.tensor(std::vector<float>(y, y+y_size));
+
+    struct PushConsts {
+        int k;
+    } pushConsts {
+        k
+    };
+
+    mgr.sequence()
+        ->record<kp::OpTensorSyncDevice>({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY})
+        ->record<kp::OpAlgoDispatch>(mgr.algorithm({tensorBlockQ4_0D, tensorBlockQ4_0QS, tensorY}, spirv, {nb, qk/2, 1}, {}, {0}), std::vector<PushConsts>{pushConsts})
+        ->record<kp::OpTensorSyncLocal>({tensorY})
+        ->eval();
+
+    // Tensor::size() counts elements, so convert to bytes for memcpy
+    std::memcpy(y, tensorY->data(), tensorY->size() * sizeof(float));
+}
+
+
+template<>
+kp::Tensor::TensorDataTypes
+kp::TensorT<half>::dataType()
+{
+    return TensorDataTypes::eFloat;
+}
+
+template<>
+kp::Tensor::TensorDataTypes
+kp::TensorT<uint8_t>::dataType()
+{
+    return TensorDataTypes::eUnsignedInt;
+}
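The sequence recorded in `ggml_vk_dequantize_row_q4_0` above follows Kompute's standard round-trip: sync tensors to the device, dispatch an algorithm, sync results back. Stripped of the quantization details, the pattern looks like this (a sketch against the Kompute v0.8-era API; the SPIR-V input is assumed to be compiled elsewhere):

```cpp
// Minimal Kompute round-trip: upload, dispatch, download.
#include <kompute/Kompute.hpp>
#include <vector>

void example(const std::vector<uint32_t>& spirv) {
    kp::Manager mgr;

    auto tIn  = mgr.tensor({1.0f, 2.0f, 3.0f});
    auto tOut = mgr.tensor({0.0f, 0.0f, 0.0f});

    auto algo = mgr.algorithm({tIn, tOut}, spirv, /*workgroup=*/{3, 1, 1});

    mgr.sequence()
        ->record<kp::OpTensorSyncDevice>({tIn, tOut})  // host -> device
        ->record<kp::OpAlgoDispatch>(algo)             // run the shader
        ->record<kp::OpTensorSyncLocal>({tOut})        // device -> host
        ->eval();                                      // submit and wait

    // tOut->vector() now holds the shader's output.
}
```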
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
new file mode 100644
index 000000000..34e6d46b3
--- /dev/null
+++ b/ggml-vulkan.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_vk_init(void);
+
+void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/ggml.c b/ggml.c
index 4319683f5..151b9eefb 100644
--- a/ggml.c
+++ b/ggml.c
@@ -161,6 +161,8 @@ inline static void* ggml_aligned_malloc(size_t size) {
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
+#elif defined(GGML_USE_KOMPUTE)
+#include "ggml-vulkan.h"
 #elif defined(GGML_USE_CUBLAS)
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
@@ -1548,7 +1550,7 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void *
 static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
     [GGML_TYPE_Q4_0] = {
-        .dequantize_row_q         = (dequantize_row_q_t) dequantize_row_q4_0,
+        .dequantize_row_q         = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_0,
         .quantize_row_q           = quantize_row_q4_0,
         .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference,
         .quantize_row_q_dot       = quantize_row_q8_0,
diff --git a/kompute b/kompute
new file mode 160000
index 000000000..63567a72b
--- /dev/null
+++ b/kompute
@@ -0,0 +1 @@
+Subproject commit 63567a72be6b26f79da92becaffa7cd55f46642b
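A plausible smoke test for the new entry point (hypothetical, not part of this patch): quantize a constant row with ggml's public quantizer, run it back through the Vulkan dequantizer, and check the result. Assumes a build configured with `-DLLAMA_KOMPUTE=ON` and the ggml API of this era (`ggml_quantize_q4_0` taking a histogram argument):

```cpp
#include "ggml.h"
#include "ggml-vulkan.h"

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int k = 32 * 8;                          // 8 q4_0 blocks
    std::vector<float> src(k, 0.5f), out(k, 0.0f);
    std::vector<uint8_t> q(ggml_type_size(GGML_TYPE_Q4_0) * (k / 32));

    int64_t hist[16] = {0};                        // required by the quantizer
    ggml_quantize_q4_0(src.data(), q.data(), k, k, hist);

    ggml_vk_dequantize_row_q4_0(q.data(), out.data(), k);

    // A constant row survives q4_0 quantization essentially exactly.
    std::printf("out[0] = %f (expect ~0.5)\n", out[0]);
    return std::fabs(out[0] - 0.5f) < 1e-4f ? 0 : 1;
}
```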