Added a --low-vram option

2023-06-13 21:40:33 +02:00 · 2023-06-13 21:40:33 +02:00 · dba14529de
commit dba14529de
parent cc60183c5f
8 changed files with 63 additions and 18 deletions
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -331,6 +331,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            }
 #else
      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--low-vram" || arg == "-lv") {
+#ifdef GGML_USE_CUBLAS
+            params.low_vram = true;
+#else
+      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
@ -479,6 +485,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
 #endif
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
    fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
@ -528,6 +535,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
    lparams.n_gpu_layers = params.n_gpu_layers;
    lparams.main_gpu     = params.main_gpu;
    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.low_vram     = params.low_vram;
    lparams.seed         = params.seed;
    lparams.f16_kv       = params.memory_f16;
    lparams.use_mmap     = params.use_mmap;
--- a/examples/common.h
+++ b/examples/common.h
@ -21,15 +21,16 @@
 int32_t get_num_physical_cores();

 struct gpt_params {
-    int32_t seed                           = -1;   // RNG seed
-    int32_t n_threads                      = get_num_physical_cores();
-    int32_t n_predict                      = -1;   // new tokens to predict
-    int32_t n_ctx                          = 512;  // context size
-    int32_t n_batch                        = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                         = 0;    // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers                   = 0;    // number of layers to store in VRAM
-    int32_t main_gpu                       = 0;    // the GPU that is used for scratch and small tensors
+    int32_t seed                            = -1;  // RNG seed
+    int32_t n_threads                       = get_num_physical_cores();
+    int32_t n_predict                       = -1;  // new tokens to predict
+    int32_t n_ctx                           = 512; // context size
+    int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
+    int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+    bool    low_vram                        = 0;   // if true, reduce VRAM usage at the cost of performance

    // sampling parameters
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -288,5 +288,6 @@ These options provide extra functionality and customization when running the LLa
 -   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
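+-   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. With this option the k and v caches are not offloaded to the GPU either, even if enough layers are requested via `--n-gpu-layers`. Requires cuBLAS.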
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -289,6 +289,7 @@ Test();
 -   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 -   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 -   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
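+-   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. With this option the k and v caches are not offloaded to the GPU either, even if enough layers are requested via `--n-gpu-layers`. Requires cuBLAS.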
 -   `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
 -   `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
 -   `--port`: Set the port to listen. Default: `8080`.
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -405,6 +405,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
  fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
  fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
  fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+  fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
 #endif
  fprintf(stderr, "  -m FNAME, --model FNAME\n");
  fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
@ -537,6 +538,14 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
      }
 #else
      fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+    }
+    else if (arg == "--low-vram" || arg == "-lv")
+    {
+#ifdef GGML_USE_CUBLAS
+      params.low_vram = true;
+#else
+      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
    }
    else if (arg == "--main-gpu" || arg == "-mg")
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -2263,6 +2263,10 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 }

 void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch) {
+    if (scratch && g_scratch_size == 0) {
+        return;
+    }
+
    if (tensor->src0 != nullptr && tensor->src0->backend == GGML_BACKEND_CPU) {
        const ggml_op src0_op = tensor->src0->op;
        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW) {
--- a/llama.cpp
+++ b/llama.cpp
@ -919,6 +919,7 @@ struct llama_context_params llama_context_default_params() {
        /*.gpu_layers                  =*/ 0,
        /*.main_gpu                    =*/ 0,
        /*.tensor_split                =*/ {0},
+        /*.low_vram                    =*/ false,
        /*.seed                        =*/ -1,
        /*.f16_kv                      =*/ true,
        /*.logits_all                  =*/ false,
@ -1027,6 +1028,7 @@ static void llama_model_load_internal(
        int n_gpu_layers,
        int main_gpu,
        const float * tensor_split,
+        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
@ -1226,8 +1228,13 @@ static void llama_model_load_internal(

        (void) vram_scratch;
 #ifdef GGML_USE_CUBLAS
-        vram_scratch = n_batch * MB;
-        ggml_cuda_set_scratch_size(vram_scratch);
+        if (low_vram) {
+            fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
+            ggml_cuda_set_scratch_size(0); // disable scratch
+        } else {
+            vram_scratch = n_batch * MB;
+            ggml_cuda_set_scratch_size(vram_scratch);
+        }
        if (n_gpu_layers > 0) {
            fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
                    __func__, vram_scratch / MB);
@ -1242,15 +1249,24 @@ static void llama_model_load_internal(
        }
        size_t vram_kv_cache = 0;
        if (n_gpu_layers > (int) hparams.n_layer + 1) {
-            fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
-            vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            if (low_vram) {
+                fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
+            } else {
+                fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            }
        }
        if (n_gpu_layers > (int) hparams.n_layer + 2) {
-            fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
-            vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            if (low_vram) {
+                fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
+            } else {
+                fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
+                vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
+            }
        }
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
        fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-                __func__, std::min(n_gpu_layers, (int) hparams.n_layer + 3), hparams.n_layer + 3);
+                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
        fprintf(stderr, "%s: total VRAM used: %zu MB\n",
                __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
@ -1290,6 +1306,7 @@ static bool llama_model_load(
        int n_gpu_layers,
        int main_gpu,
        float * tensor_split,
+        bool low_vram,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
@ -1297,7 +1314,7 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type,
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
                                  use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::exception & err) {
@ -1375,6 +1392,9 @@ static bool llama_eval_internal(

    // offload functions set the tensor output backend to GPU
    // tensors are GPU-accelerated if any input or the output has been offloaded
+    //
+    // with the low VRAM option VRAM scratch is disabled in llama_model_load_internal
+    // in that case ggml_cuda_assign_buffers has no effect
    offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
    offload_func_t offload_func_kq = llama_nop;
    offload_func_t offload_func_v  = llama_nop;
@ -2608,8 +2628,8 @@ struct llama_context * llama_init_from_file(

    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+                params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
                params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);
        llama_free(ctx);
--- a/llama.h
+++ b/llama.h
@ -77,6 +77,7 @@ extern "C" {
        int n_gpu_layers;                      // number of layers to store in VRAM
        int main_gpu;                          // the GPU that is used for scratch and small tensors
        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        bool low_vram;                         // if true, reduce VRAM usage at the cost of performance
        int seed;                              // RNG seed, -1 for random

        bool f16_kv;     // use fp16 for KV cache