diff --git a/ggml-opencl-legacy.c b/ggml-opencl-legacy.c
index 50fef77df..d6ab46f88 100644
--- a/ggml-opencl-legacy.c
+++ b/ggml-opencl-legacy.c
@@ -164,6 +164,7 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
         cl_int err_ = (err); \
         if (err_ != CL_SUCCESS) { \
             fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
+            fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \
             exit(1); \
         } \
     } while (0)
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index 21d053e71..763c791a0 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -313,6 +313,7 @@ static std::string generate_kernels() {
         cl_int err_ = (err); \
         if (err_ != CL_SUCCESS) { \
             fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
+            fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \
             exit(1); \
         } \
     } while (0)
@@ -840,6 +841,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                                            &queue, &ev_sgemm);

         if (status != clblast::StatusCode::kSuccess) {
+            printf("\nYou may be out of VRAM. Please check if you have enough.\n");
             GGML_ASSERT(false);
         }
     }
@@ -872,7 +874,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+        ((GetQuantsUnshuffled() && GetGPULayers()>0 && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
         return true;
     }
diff --git a/ggml.c b/ggml.c
index f4ff09214..d4235545b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1499,6 +1499,7 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
     return quantize_fns[i];
 }

+int true_gpu_layers = 0;
 bool quants_unshuffled = false; //new GGJT_2 is unshuffled, all old ones are shuffled
 static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT]; //forward decl
 static inline quantize_fns_t get_quantize_fn(size_t i)
diff --git a/ggml.h b/ggml.h
index 2d9a6420c..b44e24ecc 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1094,6 +1094,8 @@ extern "C" {
     void SetQuantsUnshuffled(bool unshuffled);
     bool GetQuantsUnshuffled();
+    void SetGPULayers(bool layers);
+    bool GetGPULayers();

     GGML_API int ggml_cpu_has_avx  (void);
     GGML_API int ggml_cpu_has_avx2 (void);
diff --git a/ggml_v2.c b/ggml_v2.c
index de2afd66e..773b43733 100644
--- a/ggml_v2.c
+++ b/ggml_v2.c
@@ -1579,6 +1579,14 @@ inline bool GetQuantsUnshuffled()
 {
     return quants_unshuffled;
 }
+inline void SetGPULayers(bool layers)
+{
+    true_gpu_layers = layers;
+}
+inline bool GetGPULayers()
+{
+    return true_gpu_layers;
+}

 //TODO: integrate backwards compat
 static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = {
diff --git a/llama.cpp b/llama.cpp
index 8874af520..fddb51d82 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1073,7 +1073,8 @@ static void llama_model_load_internal(
     {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
         if(GetQuantsUnshuffled())
-        {
+        {
+            SetGPULayers(n_gpu);
         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
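
Note on the new Set/GetGPULayers accessors: the declarations added to ggml.h and the definitions in ggml_v2.c take and return bool, while the backing variable true_gpu_layers is an int and llama.cpp passes the int n_gpu. Through the bool conversion, any nonzero layer count collapses to 1, which still satisfies the GetGPULayers()>0 gate in ggml_cl_can_mul_mat but drops the actual count. A minimal standalone sketch of the apparent intent, assuming the layer count is meant to be carried as an int; it mirrors the names in the diff but is not the code as committed:

#include <stdio.h>

static int true_gpu_layers = 0;   /* layers offloaded to the GPU, 0 = CPU only */

void SetGPULayers(int layers) { true_gpu_layers = layers; }
int  GetGPULayers(void)       { return true_gpu_layers; }

int main(void) {
    /* e.g. n_gpu as computed in llama_model_load_internal */
    SetGPULayers(32);

    /* same gate as the one added to ggml_cl_can_mul_mat: only route
       matmuls through OpenCL when at least one layer was offloaded */
    if (GetGPULayers() > 0) {
        printf("offloading %d layers, OpenCL matmul enabled\n", GetGPULayers());
    }
    return 0;
}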