From d53924799631f93f9207c7be511cda5e75b33066 Mon Sep 17 00:00:00 2001
From: niansa
Date: Fri, 23 Jun 2023 14:03:33 +0200
Subject: [PATCH] Began implementing ggml_graph_compute

---
 ggml-vulkan.cpp | 95 ++++++++++++++++++++++++++++++++++++++++++++++---
 ggml-vulkan.h   |  6 ++--
 llama.cpp       |  8 ++---
 3 files changed, 97 insertions(+), 12 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index 7879a5937..d8cc9f1fa 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -75,6 +76,7 @@ bool ggml_vk_add_buffer(
     return true;
 }
 
+static
 std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) {
     auto res = ctx->buffers.find(name);
     if (res == ctx->buffers.end()) return nullptr;
@@ -82,7 +84,7 @@ std::shared_ptr ggml_vk_get_buffer(struct ggml_kompute_context * ctx
 }
 
 
-void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
     if (t->backend != GGML_BACKEND_GPU) {
         return;
     }
@@ -98,7 +100,7 @@ void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor *
     ctx->tensors.emplace(t, std::move(tensor));
 }
 
-void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
     if (t->backend != GGML_BACKEND_GPU) {
         return;
     }
@@ -107,12 +109,23 @@ void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor *
     auto size = ggml_nbytes(t);
 
     auto res = ctx->tensors.find(t);
+    assert(res != ctx->tensors.end());
     auto tensor = res->second;
 
     mgr.sequence()->eval({tensor});
     memcpy(data, tensor->data(), size);
 }
 
 
+static
+const std::shared_ptr & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+    assert(t->backend != GGML_BACKEND_GPU);
+
+    auto res = ctx->tensors.find(t);
+    assert(res != ctx->tensors.end());
+
+    return res->second;
+}
+
 static std::vector compileSource(const std::string& source) {
     //FIXME: Terrible solution!!!!
@@ -302,17 +315,89 @@ void ggml_vk_abmath(const std::shared_ptr& inA, uint32_t inAOff,
     };
 
     mgr.sequence()
-            ->eval(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size(), inB->size())}, {}, {pushConsts}));
+            ->eval(mgr.algorithm({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts}));
 }
 
 
 template 
 void ggml_vk_add(Args&&... args) {
-    return ggml_vk_abmath<'+'>(std::forward(args)...);
+    return ggml_vk_abmath<'+'>(std::forward(args)...);
 }
 
 
 template 
 void ggml_vk_mul(Args&&... args) {
-    return ggml_vk_abmath<'*'>(std::forward(args)...);
+    return ggml_vk_abmath<'*'>(std::forward(args)...);
+}
+
+
+void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
+    printf("%s: evaluating graph\n", __func__);
+
+    const int n_seq = gf->n_threads;
+
+    std::vector sequences(n_seq);
+
+    std::vector threads(n_seq);
+
+    for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
+        const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;
+
+        threads[seq_idx] = std::thread([&, seq_idx, n_nodes_per_seq] () {
+            size_t offs_src0 = 0;
+            size_t offs_src1 = 0;
+            size_t offs_dst = 0;
+
+            auto& seq = sequences[seq_idx];
+
+            const int node_start = (seq_idx + 0) * n_nodes_per_seq;
+            const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq;
+
+            for (int i = node_start; i < node_end; ++i) {
+                printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+
+                struct ggml_tensor * src0 = gf->nodes[i]->src0;
+                struct ggml_tensor * src1 = gf->nodes[i]->src1;
+                struct ggml_tensor * dst = gf->nodes[i];
+
+                const int64_t ne00 = src0 ? src0->ne[0] : 0;
+                const int64_t ne01 = src0 ? src0->ne[1] : 0;
+                const int64_t ne02 = src0 ? src0->ne[2] : 0;
+                const int64_t ne03 = src0 ? src0->ne[3] : 0;
+
+                const uint64_t nb00 = src0 ? src0->nb[0] : 0;
+                const uint64_t nb01 = src0 ? src0->nb[1] : 0;
+                const uint64_t nb02 = src0 ? src0->nb[2] : 0;
+                const uint64_t nb03 = src0 ? src0->nb[3] : 0;
+
+                const int64_t ne10 = src1 ? src1->ne[0] : 0;
+                const int64_t ne11 = src1 ? src1->ne[1] : 0;
+                const int64_t ne12 = src1 ? src1->ne[2] : 0;
+                const int64_t ne13 = src1 ? src1->ne[3] : 0; (void)ne13;
+
+                const uint64_t nb10 = src1 ? src1->nb[0] : 0;
+                const uint64_t nb11 = src1 ? src1->nb[1] : 0;
+                const uint64_t nb12 = src1 ? src1->nb[2] : 0;
+                const uint64_t nb13 = src1 ? src1->nb[3] : 0; (void)nb13;
+
+                const int64_t ne0 = dst ? dst->ne[0] : 0;
+                const int64_t ne1 = dst ? dst->ne[1] : 0;
+                const int64_t ne2 = dst ? dst->ne[2] : 0;
+                const int64_t ne3 = dst ? dst->ne[3] : 0;
+
+                const uint64_t nb0 = dst ? dst->nb[0] : 0;
+                const uint64_t nb1 = dst ? dst->nb[1] : 0;
+                const uint64_t nb2 = dst ? dst->nb[2] : 0;
+                const uint64_t nb3 = dst ? dst->nb[3] : 0;
+
+                const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+                const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+                const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
+
+                std::shared_ptr id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullptr;
+                std::shared_ptr id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullptr;
+                std::shared_ptr id_dst = dst ? ggml_vk_get_tensor(ctx, dst) : nullptr;
+            }
+        });
+    }
 }
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index b7f7371cb..19aaec949 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -27,12 +27,12 @@ bool ggml_vk_add_buffer(
         size_t size,
         size_t max_size);
 
-void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
-void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
+void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
+void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 
 void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k);
 void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k);
-void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph);
+void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
 }
diff --git a/llama.cpp b/llama.cpp
index 85acd4e05..89c7fa656 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1707,10 +1707,10 @@ static bool llama_eval_internal(
 
         ggml_graph_compute(ctx0, &gf);
     }
-#elif defined(GGML_USE_KOMPUTE_TODO)
+#elif defined(GGML_USE_KOMPUTE)
     if (lctx.ctx_kompute && N == 1) {
         ggml_vk_graph_compute(lctx.ctx_kompute, &gf);
-        ggml_vk_get_tensor   (lctx.ctx_kompute, cur);
+        ggml_vk_d2h_tensor   (lctx.ctx_kompute, cur);
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1721,8 +1721,8 @@ static bool llama_eval_internal(
         //
         if (lctx.ctx_kompute) {
             // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.k);
-            ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.v);
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v);
         }
 
         ggml_graph_compute(ctx0, &gf);
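Note on the scheduling introduced in ggml_vk_graph_compute above: gf->n_nodes is split into n_seq contiguous ranges by ceil division, one range per sequence/thread, and only the last range is clamped to the real node count. A minimal standalone sketch of that chunking follows; the values and names are hypothetical and independent of ggml and Kompute.

    #include <cstdio>

    int main() {
        // Hypothetical stand-ins for gf->n_nodes and gf->n_threads from the patch.
        const int n_nodes = 10;
        const int n_seq   = 3;

        // Ceil division: every sequence gets the same nominal chunk size.
        const int n_nodes_per_seq = (n_nodes + n_seq - 1) / n_seq;

        for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
            const int node_start = (seq_idx + 0) * n_nodes_per_seq;
            // The last sequence is clamped to the true node count instead of
            // extending to a full chunk.
            const int node_end = (seq_idx == n_seq - 1) ? n_nodes
                                                        : (seq_idx + 1) * n_nodes_per_seq;
            std::printf("seq %d handles nodes [%d, %d)\n", seq_idx, node_start, node_end);
        }
        return 0;
    }

With these values the ranges are [0, 4), [4, 8) and [8, 10): every node is assigned exactly once and only the final chunk is shorter.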