Tensor parallelism

Author: JohannesGaessler
Date: 2023-05-24 14:29:21 +02:00
Parent: 971920e935
Commit: 4f9640b8fe
10 changed files with 598 additions and 411 deletions

examples/common.cpp

@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <sstream>
 #include <unordered_set>
+#include <regex>

 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -295,6 +296,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        } else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
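For illustration (example values, not defaults), the parser accepts either separator and zero-fills the remaining device slots:

    --tensor-split 3,1   =>  params.tensor_split = {3.0f, 1.0f, 0.0f, ..., 0.0f}
    -ts 1/1/2            =>  params.tensor_split = {1.0f, 1.0f, 2.0f, 0.0f, ..., 0.0f}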
@@ -438,6 +463,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, " number of layers to store in VRAM\n");
+    fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
 #endif
     fprintf(stderr, " --mtest compute maximum memory usage\n");
     fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
@@ -484,6 +511,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     lparams.n_ctx = params.n_ctx;
     lparams.n_gpu_layers = params.n_gpu_layers;
+    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
     lparams.use_mmap = params.use_mmap;

examples/common.h

@@ -21,13 +21,14 @@
 int32_t get_num_physical_cores();

 struct gpt_params {
     int32_t seed = -1; // RNG seed
     int32_t n_threads = get_num_physical_cores();
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_gpu_layers = 0; // number of layers to store in VRAM
+    float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs

     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

examples/server/server.cpp

@@ -401,6 +401,8 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, " number of layers to store in VRAM\n");
+    fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
 #endif
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
@@ -503,6 +505,37 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        }
+        else if (arg == "--tensor-split" || arg == "-ts")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i)
+            {
+                if (i < split_arg.size())
+                {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                }
+                else
+                {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         }
         else
         {

File diff suppressed because it is too large.

ggml-cuda.h

@@ -1,10 +1,21 @@
+#pragma once
+
 #include "ggml.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

+#define GGML_CUDA_MAX_DEVICES 16
+
+struct ggml_tensor_extra_gpu {
+    int layer; // which layer the tensor is on
+    int i_device; // which device the data is on
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+};
+
 void ggml_init_cublas(void);
+void ggml_cuda_set_tensor_split(float * tensor_split);
 void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
@@ -15,7 +26,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);

-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset, int n_layer);
+void ggml_cuda_free_data(struct ggml_tensor * tensor);

 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

 #ifdef __cplusplus
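The bulk of the implementation lives in the CUDA source whose diff is suppressed above (presumably ggml-cuda.cu). As a rough sketch of the idea rather than the commit's actual code: the proportions passed to ggml_cuda_set_tensor_split can be normalized into cumulative fractions, with each device owning the contiguous slice of rows that falls inside its fraction. split_rows below is a hypothetical helper written only for illustration.

    // Hypothetical helper (not from the commit): map per-device proportions such
    // as {3, 1} onto the half-open row range [*row_low, *row_high) owned by `device`.
    #include <cstdio>

    #define MAX_DEVICES 16 // stand-in for GGML_CUDA_MAX_DEVICES

    static void split_rows(const float * tensor_split, int n_devices, int nrows,
                           int device, int * row_low, int * row_high) {
        float total = 0.0f;
        for (int i = 0; i < n_devices; ++i) {
            total += tensor_split[i];
        }
        float acc = 0.0f;
        for (int i = 0; i < device; ++i) {
            acc += tensor_split[i];
        }
        *row_low  = (int) (nrows *  (acc / total));
        *row_high = (int) (nrows * ((acc + tensor_split[device]) / total));
    }

    int main() {
        const float tensor_split[MAX_DEVICES] = {3.0f, 1.0f};
        for (int dev = 0; dev < 2; ++dev) {
            int lo, hi;
            split_rows(tensor_split, 2, 4096, dev, &lo, &hi);
            printf("device %d: rows [%d, %d)\n", dev, lo, hi); // [0, 3072) and [3072, 4096)
        }
        return 0;
    }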

ggml-opencl.cpp

@@ -676,7 +676,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
 }

 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_CL);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];

@@ -789,7 +789,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_CL) {
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
         d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);

@@ -800,7 +800,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy data to device
-            if (src0->backend != GGML_BACKEND_CL) {
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

@@ -829,7 +829,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }

-    if (src0->backend != GGML_BACKEND_CL) {
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);

@@ -865,7 +865,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_CL) {
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
         d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);

@@ -879,7 +879,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_CL) {
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }

@@ -936,7 +936,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }

-    if (src0->backend != GGML_BACKEND_CL) {
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);

@@ -992,7 +992,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (src0->backend == GGML_BACKEND_CPU) {
                 events.emplace_back();
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-            } else if (src0->backend == GGML_BACKEND_CL) {
+            } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->data;
             } else {
                 GGML_ASSERT(false);

@@ -1077,7 +1077,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
         return true;
     }

@@ -1156,7 +1156,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     CL_CHECK(clFinish(queue));

     tensor->data = dst;
-    tensor->backend = GGML_BACKEND_CL;
+    tensor->backend = GGML_BACKEND_GPU;
 }

 void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {

ggml.c

@@ -3722,6 +3722,12 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
 }

+size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+}
+
 int ggml_blck_size(enum ggml_type type) {
     return GGML_BLCK_SIZE[type];
 }
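A worked example of the new helper (assuming Q4_0's layout of 32-element blocks stored in 18 bytes each, i.e. GGML_BLCK_SIZE = 32 and GGML_TYPE_SIZE = 18): the first 1024 rows of a 4096 x 4096 Q4_0 matrix take

    nrows_split * ne[0] * type_size / block_size = 1024 * 4096 * 18 / 32 = 2359296 bytes (2.25 MiB)

so a device holding only a slice of a split tensor needs to allocate only that slice, not the whole tensor.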
@@ -4144,6 +4150,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_time_us =*/ 0,
         /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name =*/ { 0 },
+        /*.extra =*/ NULL,
         /*.pad =*/ { 0 },
     };

@@ -8147,7 +8154,7 @@ static void ggml_compute_forward_mul_f32(
     const int nth = params->nth;

 #ifdef GGML_USE_CLBLAST
-    if (src1->backend == GGML_BACKEND_CL) {
+    if (src1->backend == GGML_BACKEND_GPU) {
         if (ith == 0) {
             ggml_cl_mul(src0, src1, dst);
         }

@@ -12884,8 +12891,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     GGML_ASSERT(params);

 #ifdef GGML_USE_CUBLAS
-    bool used_cuda = ggml_cuda_compute_forward(params, tensor);
-    if (used_cuda) {
+    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
+    if (skip_cpu) {
         return;
     }
 #endif // GGML_USE_CUBLAS

@@ -14196,7 +14203,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
                     node->n_tasks = 1; // TODO: this actually is doing nothing
                                        //       the threads are still spinning
-                    cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                 }
                 else
 #elif defined(GGML_USE_CLBLAST)

ggml.h

@@ -249,8 +249,8 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
-        GGML_BACKEND_CUDA = 1,
-        GGML_BACKEND_CL = 2,
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
     };

     // model file types

@@ -375,7 +375,9 @@
         char name[GGML_MAX_NAME];

-        char padding[16];
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[4];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -424,6 +426,7 @@
     struct ggml_compute_params {
         enum ggml_task_type type;

+        // ith = thread index, nth = number of threads
         int ith, nth;

         // work buffer for all threads

@@ -442,9 +445,10 @@
     GGML_API void ggml_print_object (const struct ggml_object * obj);
     GGML_API void ggml_print_objects(const struct ggml_context * ctx);

-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
     GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
     GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
+    GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

     GGML_API int ggml_blck_size (enum ggml_type type);
     GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block

llama.cpp

@@ -199,6 +199,12 @@ struct llama_model {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cuda_free_data(tensors_by_name[i].second);
+        }
+#endif // GGML_USE_CUBLAS
     }
 };

@@ -665,7 +671,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, int layer, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());

@@ -676,10 +682,10 @@ struct llama_model_loader {
                 name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }

-        return get_tensor_for(lt, backend);
+        return get_tensor_for(lt, layer, backend);
     }

-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, int layer, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));

@@ -689,6 +695,17 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
+#ifdef GGML_USE_CUBLAS
+        if (backend == GGML_BACKEND_GPU || backend == GGML_BACKEND_GPU_SPLIT) {
+            struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+            extra->layer = layer;
+            tensor->extra = extra;
+        }
+#else
+        (void) layer;
+#endif // GGML_USE_CUBLAS
+
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -842,6 +859,7 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
         /*.gpu_layers =*/ 0,
+        /*.tensor_split =*/ {0},
         /*.seed =*/ -1,
         /*.f16_kv =*/ true,
         /*.logits_all =*/ false,

@@ -926,6 +944,7 @@ static void llama_model_load_internal(
         llama_context & lctx,
         int n_ctx,
         int n_gpu_layers,
+        float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,

@@ -1019,13 +1038,16 @@ static void llama_model_load_internal(
     }

 #if defined(GGML_USE_CUBLAS)
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CL
     fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif

     // prepare memory for the weights

@@ -1037,45 +1059,46 @@ static void llama_model_load_internal(
         ml->ggml_ctx = ctx;

-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
-        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, -1, GGML_BACKEND_CPU);
+        model.norm = ml->get_tensor("norm.weight", {n_embd}, -1, GGML_BACKEND_CPU);

         // "output" tensor
         {
             ggml_backend backend_output;
             if (n_gpu_layers > int(n_layer)) { // NOLINT
-                backend_output = LLAMA_BACKEND_OFFLOAD;
+                backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
             } else {
                 backend_output = GGML_BACKEND_CPU;
             }

-            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, -1, backend_output);
         }

         const int i_gpu_start = n_layer - n_gpu_layers;

         model.layers.resize(n_layer);
         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+            const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

             auto & layer = model.layers[i];

             std::string layers_i = "layers." + std::to_string(i);

-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, i, backend);
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, i, backend_split);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, i, backend_split);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, i, backend_split);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, i, backend_split);
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, i, backend);
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, i, backend_split);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, i, backend_split);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, i, backend_split);

-            if (backend == LLAMA_BACKEND_OFFLOAD) {
+            if (backend == GGML_BACKEND_GPU) {
                 vram_total +=
                     ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                     ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
@@ -1127,6 +1150,8 @@ static void llama_model_load_internal(
 #if defined(GGML_USE_CUBLAS)
     {
+        ggml_cuda_set_tensor_split(tensor_split);
+
         size_t done_size = 0;
         size_t data_size = 0;
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {

@@ -1136,13 +1161,14 @@ static void llama_model_load_internal(
             }
         }
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+            ggml_backend backend = lt.ggml_tensor->backend;
+            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
                 continue;
             }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off, hparams.n_layer);
             done_size += lt.size;
         }
     }

@@ -1157,7 +1183,7 @@ static void llama_model_load_internal(
             }
         }
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CL) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
                 continue;
             }
             if (progress_callback) {

@@ -1167,6 +1193,8 @@ static void llama_model_load_internal(
             done_size += lt.size;
         }
     }
+#else
+    (void) tensor_split;
 #endif

     if (progress_callback) {
@@ -1185,6 +1213,7 @@ static bool llama_model_load(
         llama_context & lctx,
         int n_ctx,
         int n_gpu_layers,
+        float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,

@@ -1192,8 +1221,8 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
-                                  vocab_only, progress_callback, progress_callback_user_data);
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, tensor_split, memory_type, use_mmap,
+                                  use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
         fprintf(stderr, "error loading model: %s\n", err.c_str());

@@ -2293,8 +2322,8 @@ struct llama_context * llama_init_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
-                          params.use_mmap, params.use_mlock, params.vocab_only,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, params.tensor_split,
+                          memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                           params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);

@@ -2547,7 +2576,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         }
         size_t idx = model_loader->tensors_map.name_to_idx[base_name];
         llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+        base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, -1, GGML_BACKEND_CPU);
         lt.data = (uint8_t *) lt.ggml_tensor->data;
         model_loader->load_data_for(lt);
         lt.ggml_tensor->data = lt.data;

llama.h

@@ -1,6 +1,13 @@
 #ifndef LLAMA_H
 #define LLAMA_H

+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -65,9 +72,10 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);

     struct llama_context_params {
         int n_ctx; // text context
         int n_gpu_layers; // number of layers to store in VRAM
-        int seed; // RNG seed, -1 for random
+        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        int seed; // RNG seed, -1 for random

         bool f16_kv; // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
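Putting the new API surface together, a minimal usage sketch (the model path, the 3:1 split, and the layer count are illustrative values, not defaults):

    // Sketch: fill the new tensor_split field before creating the context.
    #include "llama.h"
    #include <cstdio>

    int main() {
        llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 40;        // offload 40 layers to the GPU(s)
    #if LLAMA_MAX_DEVICES > 1
        params.tensor_split[0] = 3.0f;   // GPU 0 gets 3 parts of the weights
        params.tensor_split[1] = 1.0f;   // GPU 1 gets 1 part
    #endif

        struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        llama_free(ctx);
        return 0;
    }

With cuBLAS enabled, the split proportions are forwarded through llama_model_load_internal to ggml_cuda_set_tensor_split; without cuBLAS, LLAMA_MAX_DEVICES is 1 and the field is effectively ignored.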