ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.
* Added q4_1 via template
* Added missing __syncthreads();
* --gpu_layers -> --gpu-layers
* Shorter dequantize_mul_mat_vec line
* q5_0 dequantize_mul_mat kernel
* More readable dequantize_mul_mat_vec logic
* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16
* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-05-13 15:38:36 +02:00 · 2023-05-13 15:38:36 +02:00 · 905d87b70a
commit 905d87b70a
parent f954edda93
8 changed files with 336 additions and 42 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -9,6 +9,9 @@
 #include "llama.h"

 #include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif

 #include <array>
 #include <ctime>
@ -810,6 +813,7 @@ struct llama_context_params llama_context_default_params() {
    struct llama_context_params result = {
        /*.n_ctx                       =*/ 512,
        /*.n_parts                     =*/ -1,
+        /*.n_gpu_layers                =*/ 0,
        /*.seed                        =*/ -1,
        /*.f16_kv                      =*/ false,
        /*.logits_all                  =*/ false,
@ -876,6 +880,7 @@ static void llama_model_load_internal(
        const std::string & fname,
        llama_context & lctx,
        int n_ctx,
+        int n_gpu_layers,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
@ -1022,6 +1027,33 @@ static void llama_model_load_internal(
    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

    model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#endif

    // loading time will be recalculated after the first eval, so
    // we take page faults deferred by mmap() into consideration
@ -1032,6 +1064,7 @@ static bool llama_model_load(
        const std::string & fname,
        llama_context & lctx,
        int n_ctx,
+        int n_gpu_layers,
        ggml_type memory_type,
        bool use_mmap,
        bool use_mlock,
@ -1039,7 +1072,7 @@ static bool llama_model_load(
        llama_progress_callback progress_callback,
        void *progress_callback_user_data) {
    try {
-        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                  vocab_only, progress_callback, progress_callback_user_data);
        return true;
    } catch (const std::string & err) {
@ -2111,7 +2144,7 @@ struct llama_context * llama_init_from_file(

    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                          params.use_mmap, params.use_mlock, params.vocab_only,
                          params.progress_callback, params.progress_callback_user_data)) {
        fprintf(stderr, "%s: failed to load model\n", __func__);