Began implementing ggml_graph_compute
commit d539247996
parent b8a4594f89
3 changed files with 97 additions and 12 deletions
@@ -8,6 +8,7 @@
 #include <unordered_map>
 #include <fstream>
 #include <exception>
+#include <thread>
 #include <cstring>
 #include <immintrin.h>
 #include <kompute/Kompute.hpp>
@@ -75,6 +76,7 @@ bool ggml_vk_add_buffer(
     return true;
 }
 
+static
 std::shared_ptr<kp::Tensor> ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) {
     auto res = ctx->buffers.find(name);
     if (res == ctx->buffers.end()) return nullptr;
@@ -82,7 +84,7 @@ std::shared_ptr<kp::Tensor> ggml_vk_get_buffer(struct ggml_kompute_context * ctx
 }
 
 
-void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
     if (t->backend != GGML_BACKEND_GPU) {
         return;
     }
@@ -98,7 +100,7 @@ void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor *
     ctx->tensors.emplace(t, std::move(tensor));
 }
 
-void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
     if (t->backend != GGML_BACKEND_GPU) {
         return;
     }
@@ -107,12 +109,23 @@ void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor *
     auto size = ggml_nbytes(t);
 
     auto res = ctx->tensors.find(t);
+    assert(res != ctx->tensors.end());
 
     auto tensor = res->second;
     mgr.sequence()->eval<kp::OpTensorSyncLocal>({tensor});
     memcpy(data, tensor->data<void>(), size);
 }
 
+static
+const std::shared_ptr<kp::Tensor> & ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
+    assert(t->backend != GGML_BACKEND_GPU);
+
+    auto res = ctx->tensors.find(t);
+    assert(res != ctx->tensors.end());
+
+    return res->second;
+}
+
 
 static std::vector<uint32_t> compileSource(const std::string& source) {
     //FIXME: Terrible solution!!!!
@@ -302,7 +315,7 @@ void ggml_vk_abmath(const std::shared_ptr<kp::Tensor>& inA, uint32_t inAOff,
     };
 
     mgr.sequence()
-        ->eval<kp::OpAlgoDispatch>(mgr.algorithm<float, PushConstants>({inA, inB, out}, spirv, {std::min(inA->size(), inB->size())}, {}, {pushConsts}));
+        ->eval<kp::OpAlgoDispatch>(mgr.algorithm<float, PushConstants>({inA, inB, out}, spirv, {std::min(inA->size()-inAOff, inB->size()-inBOff)}, {}, {pushConsts}));
 }
 
 template <typename... Args>
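For context on the ggml_vk_abmath change above: the dispatch size handed to the Kompute algorithm now counts only the elements remaining past each input's offset, rather than the full tensor sizes. A minimal sketch of that arithmetic, using made-up sizes and offsets purely for illustration (not values from this commit):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical element counts and offsets, chosen only to illustrate the fix.
    std::uint32_t sizeA = 1024, inAOff = 512;  // only 512 elements of A are addressable past the offset
    std::uint32_t sizeB = 4096, inBOff = 0;

    std::uint32_t old_count = std::min(sizeA, sizeB);                    // 1024 -> would overrun A
    std::uint32_t new_count = std::min(sizeA - inAOff, sizeB - inBOff);  // 512  -> stays in bounds

    std::printf("old = %u, new = %u\n", static_cast<unsigned>(old_count), static_cast<unsigned>(new_count));
    return 0;
}

With the old expression the dispatch would cover 1024 invocations even though only 512 elements of A are addressable past inAOff; the corrected expression clamps the count to 512.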
@@ -316,6 +329,78 @@ void ggml_vk_mul(Args&&... args) {
 }
 
 
+void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) {
+    printf("%s: evaluating graph\n", __func__);
+
+    const int n_seq = gf->n_threads;
+
+    std::vector<kp::Sequence> sequences(n_seq);
+
+    std::vector<std::thread> threads(n_seq);
+
+    for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
+        const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq;
+
+        threads[seq_idx] = std::thread([&, seq_idx, n_nodes_per_seq] () {
+            size_t offs_src0 = 0;
+            size_t offs_src1 = 0;
+            size_t offs_dst = 0;
+
+            auto& seq = sequences[seq_idx];
+
+            const int node_start = (seq_idx + 0) * n_nodes_per_seq;
+            const int node_end = (seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq;
+
+            for (int i = node_start; i < node_end; ++i) {
+                printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+
+                struct ggml_tensor * src0 = gf->nodes[i]->src0;
+                struct ggml_tensor * src1 = gf->nodes[i]->src1;
+                struct ggml_tensor * dst = gf->nodes[i];
+
+                const int64_t ne00 = src0 ? src0->ne[0] : 0;
+                const int64_t ne01 = src0 ? src0->ne[1] : 0;
+                const int64_t ne02 = src0 ? src0->ne[2] : 0;
+                const int64_t ne03 = src0 ? src0->ne[3] : 0;
+
+                const uint64_t nb00 = src0 ? src0->nb[0] : 0;
+                const uint64_t nb01 = src0 ? src0->nb[1] : 0;
+                const uint64_t nb02 = src0 ? src0->nb[2] : 0;
+                const uint64_t nb03 = src0 ? src0->nb[3] : 0;
+
+                const int64_t ne10 = src1 ? src1->ne[0] : 0;
+                const int64_t ne11 = src1 ? src1->ne[1] : 0;
+                const int64_t ne12 = src1 ? src1->ne[2] : 0;
+                const int64_t ne13 = src1 ? src1->ne[3] : 0; (void)ne13;
+
+                const uint64_t nb10 = src1 ? src1->nb[0] : 0;
+                const uint64_t nb11 = src1 ? src1->nb[1] : 0;
+                const uint64_t nb12 = src1 ? src1->nb[2] : 0;
+                const uint64_t nb13 = src1 ? src1->nb[3] : 0; (void)nb13;
+
+                const int64_t ne0 = dst ? dst->ne[0] : 0;
+                const int64_t ne1 = dst ? dst->ne[1] : 0;
+                const int64_t ne2 = dst ? dst->ne[2] : 0;
+                const int64_t ne3 = dst ? dst->ne[3] : 0;
+
+                const uint64_t nb0 = dst ? dst->nb[0] : 0;
+                const uint64_t nb1 = dst ? dst->nb[1] : 0;
+                const uint64_t nb2 = dst ? dst->nb[2] : 0;
+                const uint64_t nb3 = dst ? dst->nb[3] : 0;
+
+                const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+                const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+                const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
+
+                std::shared_ptr<kp::Tensor> id_src0 = src0 ? ggml_vk_get_tensor(ctx, src0) : nullptr;
+                std::shared_ptr<kp::Tensor> id_src1 = src1 ? ggml_vk_get_tensor(ctx, src1) : nullptr;
+                std::shared_ptr<kp::Tensor> id_dst = dst ? ggml_vk_get_tensor(ctx, dst) : nullptr;
+            }
+        });
+    }
+}
+
+
 template<>
 kp::Tensor::TensorDataTypes
 kp::TensorT<half>::dataType()
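The new ggml_vk_graph_compute above splits the graph's nodes into n_seq contiguous ranges, one per std::thread and kp::Sequence, using a ceiling division so every node is covered and the last thread absorbs the remainder. A small self-contained sketch of that partitioning, with hypothetical node and sequence counts chosen only for illustration:

#include <cstdio>

int main() {
    // Hypothetical sizes for illustration: 10 graph nodes split across 4 sequences.
    const int n_nodes = 10;
    const int n_seq   = 4;
    const int n_nodes_per_seq = (n_nodes + n_seq - 1) / n_seq;  // ceiling division -> 3

    for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) {
        const int node_start = seq_idx * n_nodes_per_seq;
        const int node_end   = (seq_idx == n_seq - 1) ? n_nodes : (seq_idx + 1) * n_nodes_per_seq;
        std::printf("seq %d encodes nodes [%d, %d)\n", seq_idx, node_start, node_end);
    }
    return 0;
}

With 10 nodes and 4 sequences this prints [0, 3), [3, 6), [6, 9) and [9, 10), matching the node_start/node_end computation in the new function.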
@@ -27,12 +27,12 @@ bool ggml_vk_add_buffer(
         size_t size,
         size_t max_size);
 
-void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
-void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
+void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
+void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 
 void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k);
 void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k);
-void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph);
+void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
 }
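The renames in the header hunk make the transfer direction explicit: ggml_vk_h2d_tensor uploads a tensor host-to-device, ggml_vk_d2h_tensor copies it back device-to-host. A hedged sketch of the call order that naming implies around a graph evaluation follows; the wrapper, its name, and the inp/out tensors are assumptions for illustration, and context creation and tensor allocation are not part of this commit:

// Sketch only. These declarations are copied from the header changed in this commit;
// everything else (the wrapper, its name, the inp/out tensors, context setup) is assumed.
void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);

// Hypothetical helper showing the order the h2d/d2h naming implies.
static void run_graph_on_gpu(struct ggml_kompute_context * ctx,
                             struct ggml_cgraph * gf,
                             struct ggml_tensor * inp,
                             struct ggml_tensor * out) {
    ggml_vk_h2d_tensor(ctx, inp);    // host -> device: stage the input (no-op unless backend is GGML_BACKEND_GPU)
    ggml_vk_graph_compute(ctx, gf);  // encode/evaluate the graph on the Kompute backend
    ggml_vk_d2h_tensor(ctx, out);    // device -> host: copy the result back into the ggml tensor's data
}

The llama_eval_internal hunks below follow the download half of this pattern, pulling cur and the KV cache tensors back to the host after the GPU path runs.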
@@ -1707,10 +1707,10 @@ static bool llama_eval_internal(
 
         ggml_graph_compute(ctx0, &gf);
     }
-#elif defined(GGML_USE_KOMPUTE_TODO)
+#elif defined(GGML_USE_KOMPUTE)
     if (lctx.ctx_kompute && N == 1) {
         ggml_vk_graph_compute(lctx.ctx_kompute, &gf);
-        ggml_vk_get_tensor (lctx.ctx_kompute, cur);
+        ggml_vk_d2h_tensor (lctx.ctx_kompute, cur);
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1721,8 +1721,8 @@ static bool llama_eval_internal(
         //
         if (lctx.ctx_kompute) {
             // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.k);
-            ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.v);
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.k);
+            ggml_vk_d2h_tensor(lctx.ctx_kompute, kv_self.v);
         }
 
         ggml_graph_compute(ctx0, &gf);