Added more functions from Metal

niansa 2023-06-23 11:50:30 +02:00
parent 9cdaea9240
commit 339bc36cdd
3 changed files with 222 additions and 6 deletions

View file

@@ -5,8 +5,10 @@
#include <vector>
#include <string>
#include <memory>
#include <cstring>
#include <unordered_map>
#include <fstream>
#include <exception>
#include <cstring>
#include <immintrin.h>
#include <kompute/Kompute.hpp>
@@ -14,8 +16,6 @@
#error Your C implementation is not IEC 559 compliant, which is required for proper Vulkan interop.
#endif
typedef ggml_fp16_t half;
#define MULTILINE_QUOTE(...) #__VA_ARGS__
#define STRINGIFY(x) STRINGIFY2(x)
#define STRINGIFY2(x) #x
@@ -24,6 +24,10 @@ typedef ggml_fp16_t half;
#define QR4_0 2
#define QK4_1 32
typedef ggml_fp16_t half;
enum class byte : unsigned char {};
typedef struct {
half d;
uint8_t qs[QK4_0 / 2];
@@ -35,12 +39,82 @@ typedef struct {
uint8_t qs[QK4_1 / 2];
} block_q4_1;
struct ggml_kompute_context {
std::unordered_map<const char *, std::shared_ptr<kp::Tensor>> buffers;
std::unordered_map<struct ggml_tensor *, std::shared_ptr<kp::Tensor>> tensors;
};
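// Global Kompute manager: owns the Vulkan instance, device and queue used by all of the functions below.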
kp::Manager mgr;
ggml_kompute_context *ggml_vk_init() {
return new ggml_kompute_context;
}
std::vector<uint32_t> compileSource(const std::string& source) {
void ggml_metal_free(struct ggml_kompute_context * ctx) {
delete ctx;
}
bool ggml_vk_add_buffer(
struct ggml_kompute_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size) {
try {
// allocate at least max_size bytes so the largest tensor fits, but copy only the size bytes actually provided
std::vector<byte> vec(std::max(size, max_size));
std::memcpy(vec.data(), data, size);
auto tensor = mgr.tensorT<byte>(vec);
ctx->buffers.emplace(name, std::move(tensor));
} catch (const std::exception & e) {
fprintf(stderr, "ggml_vk: failed to add buffer '%s': %s\n", name, e.what());
return false;
}
return true;
}
std::shared_ptr<kp::Tensor> ggml_vk_get_buffer(struct ggml_kompute_context * ctx, const char * name) {
auto res = ctx->buffers.find(name);
if (res == ctx->buffers.end()) return nullptr;
return res->second;
}
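// Upload: copy the host data of tensor t into a device tensor and remember the mapping in ctx->tensors.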
void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
if (t->backend != GGML_BACKEND_GPU) {
return;
}
auto data = t->data;
auto size = ggml_nbytes(t);
std::vector<byte> vec(size);
memcpy(vec.data(), data, size);
auto tensor = mgr.tensorT<byte>(vec);
mgr.sequence()->eval<kp::OpTensorSyncDevice>({tensor});
ctx->tensors.emplace(t, std::move(tensor));
}
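// Download: sync the device tensor backing t and copy its contents back into t->data.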
void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t) {
if (t->backend != GGML_BACKEND_GPU) {
return;
}
auto data = t->data;
auto size = ggml_nbytes(t);
auto res = ctx->tensors.find(t);
if (res == ctx->tensors.end()) {
return; // tensor was never uploaded with ggml_vk_set_tensor
}
auto tensor = res->second;
mgr.sequence()->eval<kp::OpTensorSyncLocal>({tensor});
memcpy(data, tensor->data<void>(), size);
}
static std::vector<uint32_t> compileSource(const std::string& source) {
//FIXME: Terrible solution!!!!
std::ofstream fileOut("tmp_kp_shader.comp");
fileOut << source;
@@ -53,6 +127,7 @@ std::vector<uint32_t> compileSource(const std::string& source) {
return {(uint32_t*)buffer.data(), (uint32_t*)(buffer.data() + buffer.size())};
}
template<class T>
std::vector<half> getVecBlockQ4_0D(T *x, unsigned nb) {
std::vector<half> fres(nb);
@@ -90,12 +165,12 @@ static const std::string program_source_head = R"(
#define QK4_0 32
#define QR4_0 2
#define QK4_1 32
layout (local_size_x = 1) in;
)";
static const std::string program_dequantize_row_q4_0 =
program_source_head+'\n'+MULTILINE_QUOTE(
layout(local_size_x = 1, local_size_y = 1) in;
layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; };
layout(binding = 1) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; };
layout(binding = 2) buffer tensorY { float y[]; };
@@ -143,6 +218,7 @@ void ggml_vk_dequantize_row_q4_0(const void *x_, float *y, int k) {
static const std::string program_dequantize_row_q4_1 =
program_source_head+'\n'+MULTILINE_QUOTE(
layout(local_size_x = 1, local_size_y = 1) in;
layout(binding = 0) buffer tensorBlockQ4_0D { float16_t x_d[]; };
layout(binding = 1) buffer tensorBlockQ4_0M { float16_t x_m[]; };
layout(binding = 2) buffer tensorBlockQ4_0QS { uint8_t x_qs[]; };
@@ -191,6 +267,55 @@ void ggml_vk_dequantize_row_q4_1(const void *x_, float *y, int k) {
}
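// Element-wise binary shader shared by add and mul; the operator itself is injected via the MATH_OP define when ggml_vk_abmath compiles the source.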
static const std::string program_abmath =
program_source_head+'\n'+MULTILINE_QUOTE(
layout(push_constant) uniform PushConstants {
uint inAOff;
uint inBOff;
uint outOff;
} pcs;
layout(local_size_x = 1) in;
layout(binding = 0) buffer tensorInA { float inA[]; };
layout(binding = 1) buffer tensorInB { float inB[]; };
layout(binding = 2) buffer tensorOut { float out_[]; };
void main() {
const int i = int(gl_GlobalInvocationID.x);
out_[pcs.outOff+i] = inA[pcs.inAOff+i] MATH_OP inB[pcs.inBOff+i];
}
);
template<char mathOP>
void ggml_vk_abmath(const std::shared_ptr<kp::Tensor>& inA, uint32_t inAOff,
const std::shared_ptr<kp::Tensor>& inB, uint32_t inBOff,
std::shared_ptr<kp::Tensor>& out, uint32_t outOff) {
const static auto spirv = compileSource("#define MATH_OP "+std::string(1, mathOP)+'\n'+program_abmath);
struct PushConstants {
uint32_t inAOff, inBOff, outOff;
} pushConsts {
inAOff, inBOff, outOff
};
mgr.sequence()
->eval<kp::OpAlgoDispatch>(mgr.algorithm<float, PushConstants>({inA, inB, out}, spirv, {std::min(inA->size(), inB->size())}, {}, {pushConsts}));
}
template <typename... Args>
void ggml_vk_add(Args&&... args) {
return ggml_vk_abmath<'+'>(std::forward<Args>(args)...);
}
template <typename... Args>
void ggml_vk_mul(Args&&... args) {
return ggml_vk_abmath<'*'>(std::forward<Args>(args)...);
}
template<>
kp::Tensor::TensorDataTypes
kp::TensorT<half>::dataType()
@@ -204,3 +329,10 @@ kp::TensorT<uint8_t>::dataType()
{
return TensorDataTypes::eUnsignedInt;
}
template<>
kp::Tensor::TensorDataTypes
kp::TensorT<byte>::dataType()
{
return TensorDataTypes::eUnsignedInt;
}

View file

@@ -1,12 +1,36 @@
#pragma once
#include <cstddef>
#ifdef __cplusplus
extern "C" {
#endif
void ggml_vk_init(void);
struct ggml_kompute_context;
struct ggml_kompute_context * ggml_vk_init(void);
void ggml_metal_free(struct ggml_kompute_context * ctx);
// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_vk_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Vulkan
// - max_size specifies the maximum size of a tensor and is used to create shared views such
// that it is guaranteed that the tensor will fit in at least one of the views
//
bool ggml_vk_add_buffer(
struct ggml_kompute_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size);
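For illustration only (not part of this commit), a minimal sketch of how a caller might register a host buffer under these semantics. The buffer name "data", the sizes and the weights pointer are made-up placeholders; the free function is the one this commit names ggml_metal_free.

#include "ggml-vulkan.h"
#include <cstdio>
#include <cstdlib>

int main() {
    struct ggml_kompute_context * ctx = ggml_vk_init();

    // Illustrative values: a 16 MiB host region whose largest tensor is 4 MiB.
    const size_t data_size       = 16u * 1024 * 1024;
    const size_t max_tensor_size =  4u * 1024 * 1024;
    void * weights = std::calloc(1, data_size);

    // Map the host region so later kernel dispatches can resolve tensors inside it.
    if (!ggml_vk_add_buffer(ctx, "data", weights, data_size, max_tensor_size)) {
        std::fprintf(stderr, "failed to add buffer 'data'\n");
    }

    // The host copy is no longer needed once the buffer has been added.
    std::free(weights);
    ggml_metal_free(ctx);
    return 0;
}

llama_init_from_file in the third file of this diff does the real version of this for the "data", "eval", "kv" and scratch buffers.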
void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k);
void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph);
#ifdef __cplusplus
}

View file

@@ -14,6 +14,8 @@
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#elif defined(GGML_USE_KOMPUTE)
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_METAL
@@ -280,6 +282,8 @@ struct llama_context {
#ifdef GGML_USE_METAL
ggml_metal_context * ctx_metal = NULL;
#elif defined(GGML_USE_KOMPUTE)
ggml_kompute_context * ctx_kompute = NULL;
#endif
int buf_last = 0;
@@ -1701,6 +1705,26 @@ static bool llama_eval_internal(
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
}
ggml_graph_compute(ctx0, &gf);
}
#elif defined(GGML_USE_KOMPUTE)
if (lctx.ctx_kompute && N == 1) {
ggml_vk_graph_compute(lctx.ctx_kompute, &gf);
ggml_vk_get_tensor (lctx.ctx_kompute, cur);
} else {
// IMPORTANT:
// Since we don't have efficient Matrix x Matrix Vulkan multiplication yet, we fall back to vanilla
// ggml_graph_compute().
//
// When we implement Matrix x Matrix Vulkan multiplication, we can avoid this branch.
// But for now, we have focused only on Matrix x Vector Vulkan multiplication.
//
if (lctx.ctx_kompute) {
// We need to sync the GPU KV cache with the CPU KV cache
ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.k);
ggml_vk_get_tensor(lctx.ctx_kompute, kv_self.v);
}
ggml_graph_compute(ctx0, &gf);
}
#else
@@ -2743,6 +2767,42 @@ struct llama_context * llama_init_from_file(
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
#undef LLAMA_METAL_CHECK_BUF
}
#elif defined(GGML_USE_KOMPUTE)
if (params.n_gpu_layers > 0) {
// this allocates all Vulkan resources and memory buffers
ctx->ctx_kompute = ggml_vk_init();
void * data_ptr = NULL;
size_t data_size = 0;
if (params.use_mmap) {
data_ptr = ctx->model.mapping->addr;
data_size = ctx->model.mapping->size;
} else {
data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
data_size = ggml_get_mem_size (ctx->model.ctx);
}
const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
#define LLAMA_METAL_CHECK_BUF(result) \
if (!(result)) { \
fprintf(stderr, "%s: failed to add buffer\n", __func__); \
llama_free(ctx); \
return NULL; \
}
LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "data", data_ptr, data_size, max_size));
LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
LLAMA_METAL_CHECK_BUF(ggml_vk_add_buffer(ctx->ctx_kompute, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
#undef LLAMA_METAL_CHECK_BUF
}
#endif
return ctx;