From 9b1f955083761d473832a5c8067549efe59577a3 Mon Sep 17 00:00:00 2001
From: JohannesGaessler
Date: Mon, 15 May 2023 11:27:32 +0200
Subject: [PATCH] cuBLAS doc + error if -ngl > 0 and no cuBLAS

---
 README.md | 4 +++-
 llama.cpp | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1d84a5e6d..88bf33ff3 100644
--- a/README.md
+++ b/README.md
@@ -278,7 +278,7 @@ Building the program with BLAS support may lead to some performance improvements
 
 - cuBLAS
 
-  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. It also enables GPU-accelerated token generation via llama.cpp CUDA kernels. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
   - Using `make`:
     ```bash
     make LLAMA_CUBLAS=1
@@ -292,6 +292,8 @@ Building the program with BLAS support may lead to some performance improvements
     cmake --build . --config Release
     ```
 
+Prompt processing will automatically be GPU-accelerated. To enable token generation acceleration, use the `-ngl` or `--n-gpu-layers` argument and specify how many layers should be offloaded to the GPU. A higher value enables more GPU acceleration but also increases VRAM usage. Maximum effective values: 33 for 7B, 41 for 13B, 61 for 33B, 81 for 65B. Multi-GPU setups and iGPUs are currently not supported.
+
 Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
 
 ### Prepare Data & Run
diff --git a/llama.cpp b/llama.cpp
index 98f49abd7..48d787a9a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1054,7 +1054,10 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
     }
 #else
-    (void) n_gpu_layers;
+    if (n_gpu_layers > 0) {
+        throw format("llama.cpp was compiled without cuBLAS. "
+                     "It is not possible to offload the requested %d layers onto the GPU.\n", n_gpu_layers);
+    }
 #endif
 
     // loading time will be recalculate after the first eval, so
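
Usage sketch (not part of the patch): with a cuBLAS build, the `-ngl`/`--n-gpu-layers` argument documented above controls how many layers are offloaded. The model path, layer count, and prompt below are illustrative and assume the stock `main` example binary and a quantized 7B model.

```bash
# build with cuBLAS so both BLAS prompt processing and the CUDA offload path are compiled in
make LLAMA_CUBLAS=1

# offload 32 layers of a 7B model to the GPU (maximum effective value for 7B is 33);
# higher -ngl values increase VRAM usage
./main -m ./models/7B/ggml-model-q4_0.bin -ngl 32 -n 128 -p "Building a website can be done in 10 simple steps:"
```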
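Conversely, the llama.cpp hunk turns a previously ignored flag into a hard error: a binary built without `LLAMA_CUBLAS=1` that is asked to offload layers now aborts model loading with the message added above. A hypothetical check, reusing the same illustrative model path:

```bash
# plain CPU build without cuBLAS
make clean && make

# model loading now fails with "llama.cpp was compiled without cuBLAS. ..."
# instead of silently ignoring the requested offload
./main -m ./models/7B/ggml-model-q4_0.bin -ngl 1 -p "Hello"
```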