diff --git a/ggml-opencl-legacy.c b/ggml-opencl-legacy.c
index 50fef77df..d6ab46f88 100644
--- a/ggml-opencl-legacy.c
+++ b/ggml-opencl-legacy.c
@@ -164,6 +164,7 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
         cl_int err_ = (err); \
         if (err_ != CL_SUCCESS) { \
             fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
+            fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \
             exit(1); \
         } \
     } while (0)
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index 21d053e71..763c791a0 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -313,6 +313,7 @@ static std::string generate_kernels() {
         cl_int err_ = (err); \
         if (err_ != CL_SUCCESS) { \
             fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
+            fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \
             exit(1); \
         } \
     } while (0)
@@ -840,6 +841,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                                            &queue, &ev_sgemm);

         if (status != clblast::StatusCode::kSuccess) {
+            printf("\nYou may be out of VRAM. Please check if you have enough.\n");
             GGML_ASSERT(false);
         }
     }
@@ -872,7 +874,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+        ((GetQuantsUnshuffled() && GetGPULayers()>0 && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
         return true;
     }
diff --git a/ggml.c b/ggml.c
index f4ff09214..d4235545b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1499,6 +1499,7 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
     return quantize_fns[i];
 }

+int true_gpu_layers = 0;
 bool quants_unshuffled = false; //new GGJT_2 is unshuffled, all old ones are shuffled
 static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT]; //forward decl
 static inline quantize_fns_t get_quantize_fn(size_t i)
diff --git a/ggml.h b/ggml.h
index 2d9a6420c..b44e24ecc 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1094,6 +1094,8 @@ extern "C" {
     void SetQuantsUnshuffled(bool unshuffled);
     bool GetQuantsUnshuffled();
+    void SetGPULayers(bool layers);
+    bool GetGPULayers();

     GGML_API int ggml_cpu_has_avx  (void);
     GGML_API int ggml_cpu_has_avx2 (void);
diff --git a/ggml_v2.c b/ggml_v2.c
index de2afd66e..773b43733 100644
--- a/ggml_v2.c
+++ b/ggml_v2.c
@@ -1579,6 +1579,14 @@ inline bool GetQuantsUnshuffled()
 {
     return quants_unshuffled;
 }
+inline void SetGPULayers(bool layers)
+{
+    true_gpu_layers = layers;
+}
+inline bool GetGPULayers()
+{
+    return true_gpu_layers;
+}

 //TODO: integrate backwards compat
 static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = {
diff --git a/llama.cpp b/llama.cpp
index 8874af520..fddb51d82 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1073,7 +1073,8 @@ static void llama_model_load_internal(
     {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
         if(GetQuantsUnshuffled())
-        {
+        {
+            SetGPULayers(n_gpu);
         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
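
Note on the new Set/GetGPULayers accessors: the declarations added to ggml.h and the definitions in ggml_v2.c take and return bool, while the backing variable true_gpu_layers is an int and llama.cpp passes the int n_gpu. Through the bool conversion, any nonzero layer count collapses to 1, which still satisfies the GetGPULayers()>0 gate in ggml_cl_can_mul_mat but drops the actual count. A minimal standalone sketch of the apparent intent, assuming the layer count is meant to be carried as an int; it mirrors the names in the diff but is not the code as committed:

#include <stdio.h>

static int true_gpu_layers = 0;   /* layers offloaded to the GPU, 0 = CPU only */

void SetGPULayers(int layers) { true_gpu_layers = layers; }
int  GetGPULayers(void)       { return true_gpu_layers; }

int main(void) {
    /* e.g. n_gpu as computed in llama_model_load_internal */
    SetGPULayers(32);

    /* same gate as the one added to ggml_cl_can_mul_mat: only route
       matmuls through OpenCL when at least one layer was offloaded */
    if (GetGPULayers() > 0) {
        printf("offloading %d layers, OpenCL matmul enabled\n", GetGPULayers());
    }
    return 0;
}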