Began implementing ggml_graph_compute
commit d539247996
parent b8a4594f89
3 changed files with 97 additions and 12 deletions
@@ -8,6 +8,7 @@
 #include <unordered_map>
 #include <fstream>
 #include <exception>
+#include <thread>
 #include <cstring>
 #include <immintrin.h>
 #include <kompute/Kompute.hpp>
@@ -75,6 +76,7 @@ bool ggml_vk_add_buffer(
     return true;
 }
 
+static
 std::shared_ptr<kp::Tensor> ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) {
     auto res = ctx->buffers.find(name);
     if (res == ctx->buffers.end()) return nullptr;
@@ -82,7 +84,7 @@ std::shared_ptr<kp::Tensor> ggml_vk_get_buffer(struct ggml_kompute_context * ctx
 }
 
 
-void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
     if (t->backend != GGML_BACKEND_GPU) {
         return;
     }
@@ -98,7 +100,7 @@ void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor *
     ctx->tensors.emplace(t, std::move(tensor));
 }
 
-void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
     if (t->backend != GGML_BACKEND_GPU) {
         return;
     }
@@ -107,12 +109,23 @@ void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor *
     auto size = ggml_nbytes(t);
 
     auto res = ctx->tensors.find(t);
+    assert(res != ctx->tensors.end());
 
     auto tensor = res->second;
     mgr.sequence()->eval<kp::OpTensorSyncLocal>({tensor});
     memcpy(data, tensor->data<void>(), size);
 }
 
+static
+const std::shared_ptr<kp::Tensor> & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+    assert(t->backend != GGML_BACKEND_GPU);
+
+    auto res = ctx->tensors.find(t);
+    assert(res != ctx->tensors.end());
+
+    return res->second;
+}
+
 
 static std::vector<uint32_t> compileSource(const std::string& source) {
     //FIXME: Terrible solution!!!!
@@ -302,7 +315,7 @@ void ggml_vk_abmath(const std::shared_ptr<kp::Tensor>& inA, uint32_t inAOff,
     };
 
     mgr.sequence()
-        ->eval<kp::OpAlgoDispatch>(mgr.algorithm<float, PushConstants>({inA, inB, out}, spirv, {std::min(inA->size(), inB->size())}, {}, {pushConsts}));
+        ->eval<kp::OpAlgoDispatch>(mgr.algorithm<float, PushConstants>({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts}));
 }
 
 template <typename... Args>
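For context on the ggml_vk_abmath change above: the dispatch size handed to the Kompute algorithm now counts only the elements remaining past each input's offset, rather than the full tensor sizes. A minimal sketch of that arithmetic, using made-up sizes and offsets purely for illustration (not values from this commit):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical element counts and offsets, chosen only to illustrate the fix.
    std::uint32_t sizeA = 1024, inAOff = 512;  // only 512 elements of A are addressable past the offset
    std::uint32_t sizeB = 4096, inBOff = 0;

    std::uint32_t old_count = std::min(sizeA, sizeB);                    // 1024 -> would overrun A
    std::uint32_t new_count = std::min(sizeA - inAOff, sizeB - inBOff);  // 512  -> stays in bounds

    std::printf("old = %u, new = %u\n", static_cast<unsigned>(old_count), static_cast<unsigned>(new_count));
    return 0;
}

With the old expression the dispatch would cover 1024 invocations even though only 512 elements of A are addressable past inAOff; the corrected expression clamps the count to 512.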
@@ -316,6 +329,78 @@ void ggml_vk_mul(Args&&... args) {
 }
 
 
+void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
+    printf("%s: evaluating graph\n", __func__);
+
+    const int n_seq = gf->n_threads;
+
+    std::vector<kp::Sequence> sequences(n_seq);
+
+    std::vector<std::thread> threads(n_seq);
+
+    for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
+        const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;
+
+        threads[seq_idx] = std::thread([&, seq_idx, n_nodes_per_seq] () {
+            size_t offs_src0 = 0;
+            size_t offs_src1 = 0;
+            size_t offs_dst = 0;
+
+            auto& seq = sequences[seq_idx];
+
+            const int node_start = (seq_idx + 0) * n_nodes_per_seq;
+            const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq;
+
+            for (int i = node_start; i < node_end; ++i) {
+                printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+
+                struct ggml_tensor * src0 = gf->nodes[i]->src0;
+                struct ggml_tensor * src1 = gf->nodes[i]->src1;
+                struct ggml_tensor * dst = gf->nodes[i];
+
+                const int64_t ne00 = src0 ? src0->ne[0] : 0;
+                const int64_t ne01 = src0 ? src0->ne[1] : 0;
+                const int64_t ne02 = src0 ? src0->ne[2] : 0;
+                const int64_t ne03 = src0 ? src0->ne[3] : 0;
+
+                const uint64_t nb00 = src0 ? src0->nb[0] : 0;
+                const uint64_t nb01 = src0 ? src0->nb[1] : 0;
+                const uint64_t nb02 = src0 ? src0->nb[2] : 0;
+                const uint64_t nb03 = src0 ? src0->nb[3] : 0;
+
+                const int64_t ne10 = src1 ? src1->ne[0] : 0;
+                const int64_t ne11 = src1 ? src1->ne[1] : 0;
+                const int64_t ne12 = src1 ? src1->ne[2] : 0;
+                const int64_t ne13 = src1 ? src1->ne[3] : 0; (void)ne13;
+
+                const uint64_t nb10 = src1 ? src1->nb[0] : 0;
+                const uint64_t nb11 = src1 ? src1->nb[1] : 0;
+                const uint64_t nb12 = src1 ? src1->nb[2] : 0;
+                const uint64_t nb13 = src1 ? src1->nb[3] : 0; (void)nb13;
+
+                const int64_t ne0 = dst ? dst->ne[0] : 0;
+                const int64_t ne1 = dst ? dst->ne[1] : 0;
+                const int64_t ne2 = dst ? dst->ne[2] : 0;
+                const int64_t ne3 = dst ? dst->ne[3] : 0;
+
+                const uint64_t nb0 = dst ? dst->nb[0] : 0;
+                const uint64_t nb1 = dst ? dst->nb[1] : 0;
+                const uint64_t nb2 = dst ? dst->nb[2] : 0;
+                const uint64_t nb3 = dst ? dst->nb[3] : 0;
+
+                const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+                const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+                const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
+
+                std::shared_ptr<kp::Tensor> id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullptr;
+                std::shared_ptr<kp::Tensor> id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullptr;
+                std::shared_ptr<kp::Tensor> id_dst = dst ? ggml_vk_get_tensor(ctx, dst) : nullptr;
+            }
+        });
+    }
+}
+
+
 template<>
 kp::Tensor::TensorDataTypes
 kp::TensorT<half>::dataType()
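The new ggml_vk_graph_compute above splits the graph's nodes into n_seq contiguous ranges, one per std::thread and kp::Sequence, using a ceiling division so every node is covered and the last thread absorbs the remainder. A small self-contained sketch of that partitioning, with hypothetical node and sequence counts chosen only for illustration:

#include <cstdio>

int main() {
    // Hypothetical sizes for illustration: 10 graph nodes split across 4 sequences.
    const int n_nodes = 10;
    const int n_seq   = 4;
    const int n_nodes_per_seq = (n_nodes + n_seq - 1) / n_seq;  // ceiling division -> 3

    for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
        const int node_start = seq_idx * n_nodes_per_seq;
        const int node_end   = (seq_idx == n_seq - 1) ? n_nodes : (seq_idx + 1) * n_nodes_per_seq;
        std::printf("seq %d encodes nodes [%d, %d)\n", seq_idx, node_start, node_end);
    }
    return 0;
}

With 10 nodes and 4 sequences this prints [0, 3), [3, 6), [6, 9) and [9, 10), matching the node_start/node_end computation in the new function.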
@@ -27,12 +27,12 @@ bool ggml_vk_add_buffer(
         size_t size,
         size_t max_size);
 
-void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
-void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
+void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
+void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 
 void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k);
 void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k);
-void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph);
+void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
 }
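The renames in the header hunk make the transfer direction explicit: ggml_vk_h2d_tensor uploads a tensor host-to-device, ggml_vk_d2h_tensor copies it back device-to-host. A hedged sketch of the call order that naming implies around a graph evaluation follows; the wrapper, its name, and the inp/out tensors are assumptions for illustration, and context creation and tensor allocation are not part of this commit:

// Sketch only. These declarations are copied from the header changed in this commit;
// everything else (the wrapper, its name, the inp/out tensors, context setup) is assumed.
void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);

// Hypothetical helper showing the order the h2d/d2h naming implies.
static void run_graph_on_gpu(struct ggml_kompute_context * ctx,
                             struct ggml_cgraph * gf,
                             struct ggml_tensor * inp,
                             struct ggml_tensor * out) {
    ggml_vk_h2d_tensor(ctx, inp);    // host -> device: stage the input (no-op unless backend is GGML_BACKEND_GPU)
    ggml_vk_graph_compute(ctx, gf);  // encode/evaluate the graph on the Kompute backend
    ggml_vk_d2h_tensor(ctx, out);    // device -> host: copy the result back into the ggml tensor's data
}

The llama_eval_internal hunks below follow the download half of this pattern, pulling cur and the KV cache tensors back to the host after the GPU path runs.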
@@ -1707,10 +1707,10 @@ static bool llama_eval_internal(
 
         ggml_graph_compute(ctx0, &gf);
     }
-#elif defined(GGML_USE_KOMPUTE_TODO)
+#elif defined(GGML_USE_KOMPUTE)
     if (lctx.ctx_kompute && N == 1) {
         ggml_vk_graph_compute(lctx.ctx_kompute, &gf);
-        ggml_vk_get_tensor (lctx.ctx_kompute, cur);
+        ggml_vk_d2h_tensor (lctx.ctx_kompute, cur);
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1721,8 +1721,8 @@ static bool llama_eval_internal(
         //
         if (lctx.ctx_kompute) {
             // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.k);
-            ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.v);
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v);
         }
 
         ggml_graph_compute(ctx0, &gf);