More fixes...
parent 9d643755a6
commit b8a4594f89

5 changed files with 8 additions and 7 deletions
@@ -64,8 +64,8 @@ bool ggml_vk_add_buffer(
         size_t size,
         size_t max_size) {
     try {
-        std::vector<byte> vec(max_size);
-        std::memcpy(vec.data(), data, std::max(size, max_size));
+        std::vector<byte> vec(std::max(size, max_size));
+        std::memcpy(vec.data(), data, size);
         auto tensor = mgr.tensorT<byte>(vec);
         ctx->buffers.emplace(name, std::move(tensor));
     } catch (const std::exception & e) {
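
The hunk above fixes an out-of-bounds read: the old code sized the staging vector at max_size but then copied std::max(size, max_size) bytes from a source buffer that only holds size bytes, so any max_size > size read past the end of the tensor data. A minimal standalone sketch of the failure mode and the fix (toy values; plain std::vector instead of the Kompute staging path):

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <vector>

int main() {
    const char data[4] = {1, 2, 3, 4};      // source holds `size` bytes
    const size_t size     = sizeof(data);   // bytes actually available
    const size_t max_size = 16;             // padded capacity the backend wants

    // Old code: copied std::max(size, max_size) == 16 bytes from a 4-byte
    // source -- an out-of-bounds read whenever max_size > size.
    //   std::vector<char> vec(max_size);
    //   std::memcpy(vec.data(), data, std::max(size, max_size));

    // Fixed code: allocate the padded capacity, copy only what the source owns.
    std::vector<char> vec(std::max(size, max_size));
    std::memcpy(vec.data(), data, size);
    return 0;
}
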
@@ -31,6 +31,7 @@ void ggml_vk_set_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 void ggml_vk_get_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
 
 void ggml_vk_dequantize_row_q4_0(const void * x, float * y, int k);
+void ggml_vk_dequantize_row_q4_1(const void * x, float * y, int k);
 void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * cgraph);
 
 #ifdef __cplusplus

ggml.c
@@ -1558,7 +1558,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
         .vec_dot_type = GGML_TYPE_Q8_0,
     },
     [GGML_TYPE_Q4_1] = {
-        .dequantize_row_q = (dequantize_row_q_t)dequantize_row_q4_1,
+        .dequantize_row_q = (dequantize_row_q_t) ggml_vk_dequantize_row_q4_1,
         .quantize_row_q = quantize_row_q4_1,
         .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference,
         .quantize_row_q_dot = quantize_row_q8_1,
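
The ggml.c hunk reroutes Q4_1 dequantization by swapping one function pointer in the per-type kernel table, the same pattern other backends use to override CPU kernels. A minimal sketch of that dispatch-table idiom (type_traits, dequantize_row_cpu, and dequantize_row_vk are hypothetical stand-ins, not the real ggml definitions):

#include <cstdio>

// Hypothetical per-type entry, loosely mirroring ggml's quantize_fns_t.
typedef void (*dequantize_row_fn)(const void * x, float * y, int k);

struct type_traits {
    dequantize_row_fn dequantize_row_q;
};

static void dequantize_row_cpu(const void *, float * y, int k) {
    for (int i = 0; i < k; ++i) y[i] = 0.0f;  // stand-in CPU kernel
    std::puts("cpu dequantize");
}

static void dequantize_row_vk(const void *, float * y, int k) {
    for (int i = 0; i < k; ++i) y[i] = 0.0f;  // stand-in GPU-backed kernel
    std::puts("vk dequantize");
}

int main() {
    // The table maps a tensor type to its kernels; a backend overrides entries.
    type_traits table[1] = { { dequantize_row_cpu } };
    table[0].dequantize_row_q = dequantize_row_vk;  // like the Q4_1 swap above

    float y[8];
    table[0].dequantize_row_q(nullptr, y, 8);       // now runs the vk kernel
    return 0;
}
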
@@ -1269,7 +1269,7 @@ static void llama_model_load_internal(
         }
     }
 #endif // GGML_USE_CUBLAS
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_KOMPUTE)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

     fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
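
For context, the std::min in this hunk clamps the requested offload count to the layers the model actually has; a tiny self-contained illustration with toy numbers:

#include <algorithm>
#include <cstdio>

int main() {
    const int n_gpu_layers = 99;  // user asked for more layers than exist
    const int n_layer      = 32;  // layers in the loaded model
    const int n_gpu = std::min(n_gpu_layers, n_layer);
    std::printf("offloading %d repeating layers to GPU\n", n_gpu);  // prints 32
    return 0;
}
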
@@ -1707,7 +1707,7 @@ static bool llama_eval_internal(
 
         ggml_graph_compute(ctx0, &gf);
     }
-#elif defined(GGML_USE_KOMPUTE)
+#elif defined(GGML_USE_KOMPUTE_TODO)
     if (lctx.ctx_kompute && N == 1) {
         ggml_vk_graph_compute(lctx.ctx_kompute, &gf);
         ggml_vk_get_tensor (lctx.ctx_kompute, cur);
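
Renaming the guard to GGML_USE_KOMPUTE_TODO parks this code path: no build defines the _TODO macro, so the #elif branch is always false and compilation falls through to the next branch while the code stays in the tree. A minimal sketch of the idiom (USE_FAST_PATH is a hypothetical build flag):

#include <cstdio>

#define USE_FAST_PATH  // imagine the build system defines this

int main() {
#if defined(USE_FAST_PATH_TODO)
    // Parked branch: nothing defines the _TODO variant, so this never
    // compiles in, but the code is preserved until the path is re-enabled.
    std::puts("fast path");
#else
    std::puts("fallback path");
#endif
    return 0;
}
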
llama.h
@@ -38,7 +38,7 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 1
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_KOMPUTE)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif
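
With GGML_USE_KOMPUTE added to the guard, Kompute builds now define LLAMA_SUPPORTS_GPU_OFFLOAD just like CUDA, OpenCL, and Metal builds, so callers can feature-test offload support at compile time. A small hypothetical usage sketch (choose_gpu_layers is illustrative, not part of the llama.cpp API):

#include "llama.h"
#include <cstdio>

static int choose_gpu_layers(int requested) {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    return requested;   // built with CUBLAS/CLBLAST/Metal/Kompute
#else
    (void) requested;   // CPU-only build: nothing can be offloaded
    return 0;
#endif
}

int main() {
    std::printf("offloading %d layers\n", choose_gpu_layers(32));
    return 0;
}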