From b8a4594f8930a53a099d91b0d77c7dd6242ee2af Mon Sep 17 00:00:00 2001
From: niansa
Date: Fri, 23 Jun 2023 12:19:33 +0200
Subject: [PATCH] More fixes...

---
 ggml-vulkan.cpp | 4 ++--
 ggml-vulkan.h   | 1 +
 ggml.c          | 2 +-
 llama.cpp       | 4 ++--
 llama.h         | 4 ++--
 5 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index b7e70e221..7879a5937 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -64,8 +64,8 @@ bool ggml_vk_add_buffer(
         size_t size,
         size_t max_size) {
     try {
-        std::vector vec(max_size);
-        std::memcpy(vec.data(), data, std::max(size, max_size));
+        std::vector vec(std::max(size, max_size));
+        std::memcpy(vec.data(), data, size);
         auto tensor = mgr.tensorT(vec);
         ctx->buffers.emplace(name, std::move(tensor));
     } catch (const std::exception & e) {
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
index a3bc781d7..b7f7371cb 100644
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -31,6 +31,7 @@ void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor *
 void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 
 void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k);
+void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k);
 void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph);
 
 #ifdef __cplusplus
diff --git a/ggml.c b/ggml.c
index 151b9eefb..34f087000 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1558,7 +1558,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_type             = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q         = (dequantize_row_q_t)dequantize_row_q4_1,
+        .dequantize_row_q         = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_1,
         .quantize_row_q           = quantize_row_q4_1,
         .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
         .quantize_row_q_dot       = quantize_row_q8_1,
diff --git a/llama.cpp b/llama.cpp
index 824ed6121..85acd4e05 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1269,7 +1269,7 @@ static void llama_model_load_internal(
         }
     }
 #endif // GGML_USE_CUBLAS
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_KOMPUTE)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
     fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -1707,7 +1707,7 @@ static bool llama_eval_internal(
 
         ggml_graph_compute(ctx0, &gf);
     }
-#elif defined(GGML_USE_KOMPUTE)
+#elif defined(GGML_USE_KOMPUTE_TODO)
     if (lctx.ctx_kompute && N == 1) {
         ggml_vk_graph_compute(lctx.ctx_kompute, &gf);
         ggml_vk_get_tensor   (lctx.ctx_kompute, cur);
diff --git a/llama.h b/llama.h
index 0de530d45..446dd49b9 100644
--- a/llama.h
+++ b/llama.h
@@ -38,7 +38,7 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
@@ -71,7 +71,7 @@ extern "C" {
 
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-    struct llama_context_params {
+    struct llama_context_params {
         int seed;    // RNG seed, -1 for random
         int n_ctx;   // text context
         int n_batch; // prompt processing batch size