Added more functions from Metal
parent 9cdaea9240
commit 339bc36cdd
3 changed files with 222 additions and 6 deletions

ggml-vulkan.cpp (142 changed lines)

@@ -5,8 +5,10 @@
#include <vector>
#include <string>
#include <memory>
#include <cstring>
#include <unordered_map>
#include <fstream>
#include <exception>
#include <cstring>
#include <immintrin.h>
#include <kompute/Kompute.hpp>

@@ -14,8 +16,6 @@
#error Your C implementation is not IEC 559 compliant, which is required for proper Vulkan interop.
#endif

typedef ggml_fp16_t half;

#define MULTILINE_QUOTE(...) #__VA_ARGS__
#define STRINGIFY(x) STRINGIFY2(x)
#define STRINGIFY2(x) #x

@@ -24,6 +24,10 @@ typedef ggml_fp16_t half;
#define QR4_0 2
#define QK4_1 32


typedef ggml_fp16_t half;
enum class byte : unsigned char {};

typedef struct {
    half d;
    uint8_t qs[QK4_0 / 2];

@@ -35,12 +39,82 @@ typedef struct {
    uint8_t qs[QK4_1 / 2];
} block_q4_1;

struct ggml_kompute_context {
    std::unordered_map<const char *, std::shared_ptr<kp::Tensor>> buffers;
    std::unordered_map<struct ggml_tensor *, std::shared_ptr<kp::Tensor>> tensors;
};


kp::Manager mgr;


ggml_kompute_context *ggml_vk_init() {
    return new ggml_kompute_context;
}

std::vector<uint32_t> compileSource(const std::string& source) {
void ggml_metal_free(struct ggml_kompute_context * ctx) {
    delete ctx;
}


bool ggml_vk_add_buffer(
        struct ggml_kompute_context * ctx,
        const char * name,
        void * data,
        size_t size,
        size_t max_size) {
    try {
        std::vector<byte> vec(max_size);
        std::memcpy(vec.data(), data, std::max(size, max_size));
        auto tensor = mgr.tensorT<byte>(vec);
        ctx->buffers.emplace(name, std::move(tensor));
    } catch (const std::exception & e) {
        fprintf(stderr, "ggml_vk: failed to add buffer '%s': %s\n", name, e.what());
        return false;
    }
    return true;
}

std::shared_ptr<kp::Tensor> ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) {
    auto res = ctx->buffers.find(name);
    if (res == ctx->buffers.end()) return nullptr;
    return res->second;
}


void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
    if (t->backend != GGML_BACKEND_GPU) {
        return;
    }

    auto data = t->data;
    auto size = ggml_nbytes(t);

    std::vector<byte> vec(size);
    memcpy(vec.data(), data, size);

    auto tensor = mgr.tensorT<byte>(vec);
    mgr.sequence()->eval<kp::OpTensorSyncDevice>({tensor});
    ctx->tensors.emplace(t, std::move(tensor));
}

void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
    if (t->backend != GGML_BACKEND_GPU) {
        return;
    }

    auto data = t->data;
    auto size = ggml_nbytes(t);

    auto res = ctx->tensors.find(t);

    auto tensor = res->second;
    mgr.sequence()->eval<kp::OpTensorSyncLocal>({tensor});
    memcpy(data, tensor->data<void>(), size);
}
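
For orientation only (this snippet is not part of the commit): a minimal sketch of how the upload/download helpers above would be driven from host code, assuming a small F32 tensor created through the existing ggml API.

// Hypothetical driver for ggml_vk_set_tensor / ggml_vk_get_tensor; everything outside
// this diff (ggml_init, ggml_new_tensor_1d, GGML_TYPE_F32, ggml_free) is the stock ggml API.
#include "ggml.h"
#include "ggml-vulkan.h"

int main() {
    struct ggml_init_params ip = { /*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false };
    struct ggml_context * ctx0 = ggml_init(ip);
    struct ggml_tensor  * t    = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 8);
    for (int i = 0; i < 8; ++i) ((float *) t->data)[i] = (float) i;

    ggml_kompute_context * ctx = ggml_vk_init();
    t->backend = GGML_BACKEND_GPU;   // both helpers are no-ops for non-GPU tensors

    ggml_vk_set_tensor(ctx, t);      // host data -> kp::Tensor -> device
    // ... dispatch kernels that read/write the device copy here ...
    ggml_vk_get_tensor(ctx, t);      // device -> kp::Tensor -> t->data

    ggml_metal_free(ctx);            // the free function keeps its Metal name in this commit
    ggml_free(ctx0);
    return 0;
}
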

static std::vector<uint32_t> compileSource(const std::string& source) {
    //FIXME: Terrible solution!!!!
    std::ofstream fileOut("tmp_kp_shader.comp");
    fileOut << source;

@@ -53,6 +127,7 @@ std::vector<uint32_t> compileSource(const std::string& source) {
    return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())};
}
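
The middle of compileSource() falls between the two hunks above and is not part of this diff. Purely as a hedged sketch (an assumption, not taken from the commit), a temp-file pipeline like this one typically closes the file, shells out to a GLSL-to-SPIR-V compiler such as glslangValidator, and reads the .spv bytes back before the return shown above:

// Hedged sketch only: NOT the commit's implementation, just one plausible shape for a
// file-based GLSL -> SPIR-V compile step built around the glslangValidator CLI.
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iterator>
#include <stdexcept>
#include <string>
#include <vector>

static std::vector<uint32_t> compileSourceSketch(const std::string & source) {
    std::ofstream fileOut("tmp_kp_shader.comp");
    fileOut << source;
    fileOut.close();
    if (std::system("glslangValidator -V tmp_kp_shader.comp -o tmp_kp_shader.comp.spv") != 0) {
        throw std::runtime_error("glslangValidator failed");
    }
    std::ifstream fileIn("tmp_kp_shader.comp.spv", std::ios::binary);
    std::vector<char> buffer((std::istreambuf_iterator<char>(fileIn)), std::istreambuf_iterator<char>());
    return {(uint32_t *) buffer.data(), (uint32_t *) (buffer.data() + buffer.size())};
}
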

template<class T>
std::vector<half> getVecBlockQ4_0D(T *x, unsigned nb) {
    std::vector<half> fres(nb);

@@ -90,12 +165,12 @@ static const std::string program_source_head = R"(
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
layout (local_size_x = 1) in;
)";


static const std::string program_dequantize_row_q4_0 =
        program_source_head+'\n'+MULTILINE_QUOTE(
layout(local_size_x = 1, local_size_y = 1) in;
layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; };
layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; };
layout(binding = 2) buffer tensorY { float y[]; };

@@ -143,6 +218,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) {

static const std::string program_dequantize_row_q4_1 =
        program_source_head+'\n'+MULTILINE_QUOTE(
layout(local_size_x = 1, local_size_y = 1) in;
layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; };
layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; };
layout(binding = 2) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; };

@@ -191,6 +267,55 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) {
}


static const std::string program_abmath =
        program_source_head+'\n'+MULTILINE_QUOTE(
layout(push_constant) uniform PushConstants {
    uint inAOff;
    uint inBOff;
    uint outOff;
} pcs;


layout(local_size_x = 1) in;
layout(binding = 0) buffer tensorInA { float inA[]; };
layout(binding = 1) buffer tensorInB { float inB[]; };
layout(binding = 2) buffer tensorout { float out[]; };


void main() {
    const int i = int(gl_GlobalInvocationID.x);

    out[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+i];
}
);

template<char mathOP>
void ggml_vk_abmath(const std::shared_ptr<kp::Tensor>& inA, uint32_t inAOff,
                    const std::shared_ptr<kp::Tensor>& inB, uint32_t inBOff,
                    std::shared_ptr<kp::Tensor>& out, uint32_t outOff) {
    const static auto spirv = compileSource("#define MATH_OP "+std::string(1, mathOP)+'\n'+program_abmath);

    struct PushConstants {
        uint32_t inAOff, inBOff, outOff;
    } pushConsts {
        inAOff, inBOff, outOff
    };

    mgr.sequence()
        ->eval<kp::OpAlgoDispatch>(mgr.algorithm<float, PushConstants>({inA, inB, out}, spirv, {std::min(inA->size(), inB->size())}, {}, {pushConsts}));
}

template <typename... Args>
void ggml_vk_add(Args&&... args) {
    return ggml_vk_abmath<'+'>(std::forward<Args>(args)...);
}

template <typename... Args>
void ggml_vk_mul(Args&&... args) {
    return ggml_vk_abmath<'*'>(std::forward<Args>(args)...);
}
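
To illustrate how these wrappers are meant to be called (this example is not in the commit): each instantiation compiles its shader once via the const static local, then dispatches one invocation per output element. A small usage sketch, assumed to live in this same translation unit so it can see mgr and the wrappers above:

// Hypothetical call site: element-wise add/mul of two small float tensors.
static void ggml_vk_abmath_example() {
    std::vector<float> av = {1, 2, 3, 4};
    std::vector<float> bv = {10, 20, 30, 40};
    std::vector<float> ov(4, 0.0f);

    auto a = mgr.tensorT<float>(av);
    auto b = mgr.tensorT<float>(bv);
    std::shared_ptr<kp::Tensor> dst = mgr.tensorT<float>(ov); // exact handle type so it binds to the out& parameter

    mgr.sequence()->eval<kp::OpTensorSyncDevice>({a, b, dst});

    ggml_vk_add(a, 0u, b, 0u, dst, 0u); // dst[i] = a[i] + b[i]
    ggml_vk_mul(a, 0u, b, 0u, dst, 0u); // dst[i] = a[i] * b[i], overwriting the sum

    mgr.sequence()->eval<kp::OpTensorSyncLocal>({dst});
    // dst->data<float>() now holds {10, 40, 90, 160} on the host
}
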

template<>
kp::Tensor::TensorDataTypes
kp::TensorT<half>::dataType()

@@ -204,3 +329,10 @@ kp::TensorT<uint8_t>::dataType()
{
    return TensorDataTypes::eUnsignedInt;
}

template<>
kp::Tensor::TensorDataTypes
kp::TensorT<byte>::dataType()
{
    return TensorDataTypes::eUnsignedInt;
}

ggml-vulkan.h

@@ -1,12 +1,36 @@
#pragma once

#include <cstddef>

#ifdef __cplusplus
extern "C" {
#endif

void ggml_vk_init(void);
struct ggml_kompute_context;


ggml_kompute_context * ggml_vk_init(void);
void ggml_metal_free(struct ggml_kompute_context * ctx);

// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_vk_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Vulkan
// - max_size specifies the maximum size of a tensor and is used to create shared views such
//   that it is guaranteed that the tensor will fit in at least one of the views
//
bool ggml_vk_add_buffer(
        struct ggml_kompute_context * ctx,
        const char * name,
        void * data,
        size_t size,
        size_t max_size);

void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);

void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k);
void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph);

#ifdef __cplusplus
}
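
The comment block in this header spells out the mapping contract: register every host region the graph will touch before computing anything. A minimal sketch of a caller following it (not part of the commit; the llama.cpp hunk below does the same thing with the real model buffers):

// Hypothetical setup; "weights" stands in for whatever host buffer backs the model.
#include <cstdio>
#include <vector>
#include "ggml-vulkan.h"

static bool map_model_buffers() {
    ggml_kompute_context * vk_ctx = ggml_vk_init();

    static std::vector<float> weights(1024, 0.0f);   // placeholder model buffer
    const size_t max_tensor = 256 * sizeof(float);   // largest single tensor inside it

    if (!ggml_vk_add_buffer(vk_ctx, "data", weights.data(), weights.size() * sizeof(float), max_tensor)) {
        fprintf(stderr, "failed to map the weight buffer\n");
        return false;
    }
    // ... register "eval", "kv" and the scratch buffers the same way,
    // then hand graphs to ggml_vk_graph_compute(vk_ctx, &gf).
    return true;
}
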
llama.cpp (60 changed lines)

@@ -14,6 +14,8 @@
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#elif defined(GGML_USE_KOMPUTE)
#include "ggml-vulkan.h"
#endif

#ifdef GGML_USE_METAL

@@ -280,6 +282,8 @@ struct llama_context {

#ifdef GGML_USE_METAL
    ggml_metal_context * ctx_metal = NULL;
#elif defined(GGML_USE_KOMPUTE)
    ggml_kompute_context * ctx_kompute = NULL;
#endif

    int buf_last = 0;

@@ -1701,6 +1705,26 @@ static bool llama_eval_internal(
            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
        }

        ggml_graph_compute(ctx0, &gf);
    }
#elif defined(GGML_USE_KOMPUTE)
    if (lctx.ctx_kompute && N == 1) {
        ggml_vk_graph_compute(lctx.ctx_kompute, &gf);
        ggml_vk_get_tensor   (lctx.ctx_kompute, cur);
    } else {
        // IMPORTANT:
        // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
        // ggml_graph_compute().
        //
        // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
        // But for now, we have focused only on Matrix x Vector Metal multiplication.
        //
        if (lctx.ctx_kompute) {
            // We need to sync the GPU KV cache with the CPU KV cache
            ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.k);
            ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.v);
        }

        ggml_graph_compute(ctx0, &gf);
    }
#else

@@ -2743,6 +2767,42 @@ struct llama_context * llama_init_from_file(
        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
#undef LLAMA_METAL_CHECK_BUF
    }
#elif defined(GGML_USE_KOMPUTE)
    if (params.n_gpu_layers > 0) {
        // this allocates all Metal resources and memory buffers
        ctx->ctx_kompute = ggml_vk_init();

        void * data_ptr  = NULL;
        size_t data_size = 0;

        if (params.use_mmap) {
            data_ptr  = ctx->model.mapping->addr;
            data_size = ctx->model.mapping->size;
        } else {
            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
            data_size = ggml_get_mem_size  (ctx->model.ctx);
        }

        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

        printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);

#define LLAMA_METAL_CHECK_BUF(result) \
    if (!(result)) { \
        fprintf(stderr, "%s: failed to add buffer\n", __func__); \
        llama_free(ctx); \
        return NULL; \
    }

        LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "data", data_ptr, data_size, max_size));

        LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
        LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));

        LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
        LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
#undef LLAMA_METAL_CHECK_BUF
    }
#endif

    return ctx;