More fixes...

Author: niansa
Date:   2023-06-23 12:19:33 +02:00
parent 9d643755a6
commit b8a4594f89

5 changed files with 8 additions and 7 deletions


@@ -64,8 +64,8 @@ bool ggml_vk_add_buffer(
                         size_t size,
                         size_t max_size) {
     try {
-        std::vector<byte> vec(max_size);
-        std::memcpy(vec.data(), data, std::max(size, max_size));
+        std::vector<byte> vec(std::max(size, max_size));
+        std::memcpy(vec.data(), data, size);
         auto tensor = mgr.tensorT<byte>(vec);
         ctx->buffers.emplace(name, std::move(tensor));
     } catch (const std::exception & e) {
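The old code sized the staging vector at max_size but copied std::max(size, max_size) bytes into it, writing past the end of the allocation whenever size exceeds max_size; the fixed version allocates the larger of the two and copies only the size bytes that actually exist. A minimal standalone sketch of the corrected pattern (the helper name is hypothetical, not code from this commit):

    #include <algorithm>
    #include <cstddef>
    #include <cstring>
    #include <vector>

    using byte = unsigned char;

    // Allocate at least max(size, max_size) bytes, then copy only the
    // `size` bytes actually present at `data`; never more than we own.
    std::vector<byte> make_staging_buffer(const void * data, std::size_t size, std::size_t max_size) {
        std::vector<byte> vec(std::max(size, max_size));
        std::memcpy(vec.data(), data, size);
        return vec;
    }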


@@ -31,6 +31,7 @@ void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor *
 void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k);
+void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k);
 void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph);
 #ifdef __cplusplus

ggml.c

@@ -1558,7 +1558,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1,
+        .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_1,
         .quantize_row_q = quantize_row_q4_1,
         .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
         .quantize_row_q_dot = quantize_row_q8_1,
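The ggml_vk_dequantize_row_q4_1 declared in the backend header above is wired in here as the Q4_1 dequantize hook, mirroring the existing Q4_0 path. For context, a Q4_1 block holds a scale d, a minimum m, and 32 packed 4-bit quants, and dequantization reconstructs each value as y = d * q + m. A rough standalone CPU sketch of that formula follows; the block layout (float d/m, interleaved nibble order) is a simplifying assumption and may differ from ggml's actual block_q4_1 packing:

    #include <cstdint>

    #define QK4_1 32

    // Simplified stand-in for ggml's block_q4_1 (assumed layout, see note above).
    typedef struct {
        float   d;              // scale
        float   m;              // minimum
        uint8_t qs[QK4_1 / 2];  // 32 quants, two 4-bit values per byte
    } block_q4_1_sketch;

    // Reconstruct k floats (k must be a multiple of QK4_1) as y = d * q + m.
    static void dequantize_row_q4_1_sketch(const block_q4_1_sketch * x, float * y, int k) {
        const int nb = k / QK4_1;
        for (int i = 0; i < nb; ++i) {
            for (int j = 0; j < QK4_1 / 2; ++j) {
                const uint8_t b = x[i].qs[j];
                y[i*QK4_1 + 2*j + 0] = x[i].d * (b & 0x0F) + x[i].m; // low nibble
                y[i*QK4_1 + 2*j + 1] = x[i].d * (b >> 4)   + x[i].m; // high nibble
            }
        }
    }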


@@ -1269,7 +1269,7 @@ static void llama_model_load_internal(
         }
     }
 #endif // GGML_USE_CUBLAS
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_KOMPUTE)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
     fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -1707,7 +1707,7 @@ static bool llama_eval_internal(
         ggml_graph_compute(ctx0, &gf);
     }
-#elif defined(GGML_USE_KOMPUTE)
+#elif defined(GGML_USE_KOMPUTE_TODO)
     if (lctx.ctx_kompute && N == 1) {
         ggml_vk_graph_compute(lctx.ctx_kompute, &gf);
         ggml_vk_get_tensor   (lctx.ctx_kompute, cur);
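Renaming the guard to GGML_USE_KOMPUTE_TODO parks this branch: no build defines that macro, so the #elif can never be taken and evaluation falls through to the CPU path. A tiny standalone illustration of the effect (not code from this commit):

    #include <cstdio>

    int main() {
    #if defined(GGML_USE_KOMPUTE)
        std::puts("Kompute graph compute");
    #elif defined(GGML_USE_KOMPUTE_TODO)
        std::puts("parked path: nothing defines this macro, so it is never built");
    #else
        std::puts("CPU fallback");
    #endif
        return 0;
    }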


@@ -38,7 +38,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
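With GGML_USE_KOMPUTE added to this guard, Kompute-enabled builds now advertise LLAMA_SUPPORTS_GPU_OFFLOAD like the CUDA, OpenCL, and Metal builds. A hedged usage sketch from the application side (the layer count is illustrative, and it assumes the llama_context_params of this era exposes n_gpu_layers):

    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_context_params params = llama_context_default_params();
    #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
        params.n_gpu_layers = 32;  // this build can offload repeating layers to the GPU
        std::puts("GPU offload supported by this build");
    #else
        std::puts("CPU-only build");
    #endif
        (void) params;
        return 0;
    }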