Tensor parallelism

parent 971920e935
commit 4f9640b8fe

10 changed files with 598 additions and 411 deletions
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <sstream>
 #include <unordered_set>
+#include <regex>

 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -295,6 +296,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        } else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
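Note: for reference, a minimal standalone sketch of how the new --tensor-split / -ts value is interpreted by the parsing code above: the argument is split on "," or "/", each field is converted with std::stof, and any remaining device slots are left at zero. The parse_tensor_split helper and kMaxDevices constant below are illustrative stand-ins, not part of the commit; the proportions are presumably normalized later by the CUDA backend (whose diff is suppressed below), so -ts 3,1 would give roughly 75% of each split tensor to device 0 and 25% to device 1.

// Illustrative only: mirrors the --tensor-split parsing added above.
// parse_tensor_split() and kMaxDevices are hypothetical, not part of the commit.
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

static const int kMaxDevices = 4; // stand-in for LLAMA_MAX_DEVICES

static std::vector<float> parse_tensor_split(const std::string & arg_next) {
    std::vector<float> tensor_split(kMaxDevices, 0.0f);

    // split string by , and / (same regex as in gpt_params_parse)
    const std::regex regex{R"([,/]+)"};
    std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
    std::vector<std::string> split_arg{it, {}};

    for (size_t i = 0; i < tensor_split.size() && i < split_arg.size(); ++i) {
        tensor_split[i] = std::stof(split_arg[i]);
    }
    return tensor_split;
}

int main() {
    const std::vector<float> split = parse_tensor_split("3,1");
    for (float f : split) {
        printf("%.1f ", f); // prints: 3.0 1.0 0.0 0.0
    }
    printf("\n");
    return 0;
}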
@@ -438,6 +463,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
 #endif
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
@@ -484,6 +511,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {

     lparams.n_ctx        = params.n_ctx;
     lparams.n_gpu_layers = params.n_gpu_layers;
+    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed         = params.seed;
     lparams.f16_kv       = params.memory_f16;
     lparams.use_mmap     = params.use_mmap;
@@ -21,13 +21,14 @@
 int32_t get_num_physical_cores();

 struct gpt_params {
     int32_t seed = -1; // RNG seed
     int32_t n_threads = get_num_physical_cores();
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
     int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_gpu_layers = 0; // number of layers to store in VRAM
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs

     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -401,6 +401,8 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
 #endif
     fprintf(stderr, "  -m FNAME, --model FNAME\n");
     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
@@ -503,6 +505,37 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
             fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
+        }
+        else if (arg == "--tensor-split" || arg == "-ts")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i)
+            {
+                if (i < split_arg.size())
+                {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                }
+                else
+                {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+            fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
         }
         else
         {
ggml-cuda.cu (770 changed lines): file diff suppressed because it is too large.

ggml-cuda.h (14 changed lines)
@@ -1,10 +1,21 @@
+#pragma once
+
 #include "ggml.h"

 #ifdef __cplusplus
 extern "C" {
 #endif

+#define GGML_CUDA_MAX_DEVICES 16
+
+struct ggml_tensor_extra_gpu {
+    int layer; // which layer the tensor is on
+    int i_device; // which device the data is on
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+};
+
 void ggml_init_cublas(void);
+void ggml_cuda_set_tensor_split(float * tensor_split);

 void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);

@@ -15,7 +26,8 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);

-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset, int n_layer);
+void ggml_cuda_free_data(struct ggml_tensor * tensor);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

 #ifdef __cplusplus
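Note: the ggml-cuda.cu changes that actually consume ggml_cuda_set_tensor_split() and the data_device[] pointers are suppressed above, so the following is only a hedged sketch, under the assumption that the user-supplied proportions are normalized and turned into contiguous per-device row ranges for a split tensor. get_row_split and its exact rounding are illustrative, not the commit's code.

// Illustrative sketch: deriving per-device row ranges from tensor_split
// proportions. Assumes at least one non-zero proportion.
#include <cstdio>

#define GGML_CUDA_MAX_DEVICES 16

// hypothetical helper, for illustration only
static void get_row_split(int nrows, const float * tensor_split, int n_devices,
                          int * row_low, int * row_high) {
    float sum = 0.0f;
    for (int i = 0; i < n_devices; ++i) {
        sum += tensor_split[i];
    }

    float acc = 0.0f;
    for (int i = 0; i < n_devices; ++i) {
        row_low[i]  = (int)(nrows * (acc / sum));
        acc        += tensor_split[i];
        row_high[i] = (int)(nrows * (acc / sum));
    }
}

int main() {
    const float tensor_split[GGML_CUDA_MAX_DEVICES] = {3.0f, 1.0f};
    int row_low[2];
    int row_high[2];

    get_row_split(4096, tensor_split, 2, row_low, row_high);
    for (int i = 0; i < 2; ++i) {
        // prints: device 0: rows [0, 3072), device 1: rows [3072, 4096)
        printf("device %d: rows [%d, %d)\n", i, row_low[i], row_high[i]);
    }
    return 0;
}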
@@ -676,7 +676,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
 }

 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_CL);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];

@@ -789,7 +789,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_CL) {
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
         d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);

@@ -800,7 +800,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy data to device
-            if (src0->backend != GGML_BACKEND_CL) {
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));

@@ -829,7 +829,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }

-    if (src0->backend != GGML_BACKEND_CL) {
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);

@@ -865,7 +865,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t y_size;
     size_t d_size;
     cl_mem d_X;
-    if (src0->backend == GGML_BACKEND_CL) {
+    if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
         d_X = (cl_mem) src0->data;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);

@@ -879,7 +879,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_CL) {
+            if (src0->backend != GGML_BACKEND_GPU) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
             }

@@ -936,7 +936,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         }
     }

-    if (src0->backend != GGML_BACKEND_CL) {
+    if (src0->backend != GGML_BACKEND_GPU) {
         ggml_cl_pool_free(d_X, x_size);
     }
     ggml_cl_pool_free(d_Y, y_size);

@@ -992,7 +992,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (src0->backend == GGML_BACKEND_CPU) {
                 events.emplace_back();
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
-            } else if (src0->backend == GGML_BACKEND_CL) {
+            } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->data;
             } else {
                 GGML_ASSERT(false);

@@ -1077,7 +1077,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+        ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
         return true;
     }

@@ -1156,7 +1156,7 @@ void ggml_cl_transform_tensor(ggml_tensor * tensor) {
     CL_CHECK(clFinish(queue));

     tensor->data = dst;
-    tensor->backend = GGML_BACKEND_CL;
+    tensor->backend = GGML_BACKEND_GPU;
 }

 void ggml_cl_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
ggml.c (14 changed lines)
@@ -3722,6 +3722,12 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     return MAX(tensor->ne[3]*tensor->nb[3], (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type]);
 }

+size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return (nrows_split*tensor->ne[0]*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];
+}
+
 int ggml_blck_size(enum ggml_type type) {
     return GGML_BLCK_SIZE[type];
 }

@@ -4144,6 +4150,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
         /*.perf_time_us =*/ 0,
         /*.data         =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
         /*.name         =*/ { 0 },
+        /*.extra        =*/ NULL,
         /*.pad          =*/ { 0 },
     };

@@ -8147,7 +8154,7 @@ static void ggml_compute_forward_mul_f32(
     const int nth = params->nth;

 #ifdef GGML_USE_CLBLAST
-    if (src1->backend == GGML_BACKEND_CL) {
+    if (src1->backend == GGML_BACKEND_GPU) {
         if (ith == 0) {
             ggml_cl_mul(src0, src1, dst);
         }

@@ -12884,8 +12891,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     GGML_ASSERT(params);

 #ifdef GGML_USE_CUBLAS
-    bool used_cuda = ggml_cuda_compute_forward(params, tensor);
-    if (used_cuda) {
+    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
+    if (skip_cpu) {
         return;
     }
 #endif // GGML_USE_CUBLAS

@@ -14196,7 +14203,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
-                            cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                         }
                         else
 #elif defined(GGML_USE_CLBLAST)
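Note: to make the arithmetic of the new ggml_nbytes_split() concrete, a small worked example, assuming the usual Q4_0 layout of 18 bytes per block of 32 elements (those constants come from ggml's quantization code, not from this diff):

// Worked example of the ggml_nbytes_split() formula, using Q4_0-style
// constants. Standalone sketch; does not include ggml.h.
#include <cassert>
#include <cstddef>
#include <cstdio>

static const size_t TYPE_SIZE_Q4_0 = 18; // bytes per block
static const size_t BLCK_SIZE_Q4_0 = 32; // elements per block

// same arithmetic as the ggml_nbytes_split() added in this commit
static size_t nbytes_split(size_t ne0, int nrows_split) {
    return (nrows_split * ne0 * TYPE_SIZE_Q4_0) / BLCK_SIZE_Q4_0;
}

int main() {
    // a 4096 x 4096 Q4_0 matrix split 3:1 across two devices
    const size_t ne0 = 4096;

    const size_t bytes_dev0 = nbytes_split(ne0, 3072); // 3072*4096*18/32
    const size_t bytes_dev1 = nbytes_split(ne0, 1024); // 1024*4096*18/32

    assert(bytes_dev0 == 7077888);
    assert(bytes_dev1 == 2359296);

    printf("device 0: %zu bytes, device 1: %zu bytes\n", bytes_dev0, bytes_dev1);
    return 0;
}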
ggml.h (16 changed lines)
@@ -249,8 +249,8 @@ extern "C" {

     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
-        GGML_BACKEND_CUDA = 1,
-        GGML_BACKEND_CL = 2,
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
     };

     // model file types

@@ -375,7 +375,9 @@ extern "C" {

         char name[GGML_MAX_NAME];

-        char padding[16];
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[4];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

@@ -424,6 +426,7 @@ extern "C" {
     struct ggml_compute_params {
         enum ggml_task_type type;

+        // ith = thread index, nth = number of threads
         int ith, nth;

         // work buffer for all threads

@@ -442,9 +445,10 @@ extern "C" {
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);

-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
     GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

     GGML_API int     ggml_blck_size (enum ggml_type type);
     GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
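Note: the new void * extra field on ggml_tensor is the hook that carries the per-tensor GPU bookkeeping declared in ggml-cuda.h above. A brief sketch of how a backend can attach and later recover it; tensor_stub and the surrounding code are illustrative only, and only the struct layout and the extra field come from this commit:

// Sketch of how ggml_tensor::extra pairs with ggml_tensor_extra_gpu.
#include <cstdio>

#define GGML_CUDA_MAX_DEVICES 16

struct ggml_tensor_extra_gpu {
    int layer;                                 // which layer the tensor is on
    int i_device;                              // which device the data is on
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer per device for split tensors
};

// stand-in for struct ggml_tensor, reduced to the field of interest
struct tensor_stub {
    void * extra = nullptr;
};

int main() {
    tensor_stub tensor;

    // what the loader does conceptually when a tensor is offloaded
    ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu();
    extra->layer = 7;
    tensor.extra = extra;

    // what the CUDA backend can later recover from the bare tensor pointer
    const ggml_tensor_extra_gpu * gpu = (const ggml_tensor_extra_gpu *) tensor.extra;
    printf("tensor belongs to layer %d\n", gpu->layer);

    delete extra;
    return 0;
}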
llama.cpp (85 changed lines)
@@ -199,6 +199,12 @@ struct llama_model {
         if (ctx) {
             ggml_free(ctx);
         }
+
+#ifdef GGML_USE_CUBLAS
+        for (size_t i = 0; i < tensors_by_name.size(); ++i) {
+            ggml_cuda_free_data(tensors_by_name[i].second);
+        }
+#endif // GGML_USE_CUBLAS
     }
 };

@@ -665,7 +671,7 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, int layer, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());

@@ -676,10 +682,10 @@ struct llama_model_loader {
                 name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }

-        return get_tensor_for(lt, backend);
+        return get_tensor_for(lt, layer, backend);
     }

-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, int layer, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));

@@ -689,6 +695,17 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+
+#ifdef GGML_USE_CUBLAS
+        if (backend == GGML_BACKEND_GPU || backend == GGML_BACKEND_GPU_SPLIT) {
+            struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+            extra->layer = layer;
+            tensor->extra = extra;
+        }
+#else
+        (void) layer;
+#endif // GGML_USE_CUBLAS
+
         tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
@@ -842,6 +859,7 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx        =*/ 512,
         /*.gpu_layers   =*/ 0,
+        /*.tensor_split =*/ {0},
         /*.seed         =*/ -1,
         /*.f16_kv       =*/ true,
         /*.logits_all   =*/ false,

@@ -926,6 +944,7 @@ static void llama_model_load_internal(
         llama_context & lctx,
         int n_ctx,
         int n_gpu_layers,
+        float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,

@@ -1019,13 +1038,16 @@ static void llama_model_load_internal(
     }

 #if defined(GGML_USE_CUBLAS)
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CL
     fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
 #else
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
 #endif

     // prepare memory for the weights
@@ -1037,45 +1059,46 @@ static void llama_model_load_internal(

     ml->ggml_ctx = ctx;

-    model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
-    model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+    model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, -1, GGML_BACKEND_CPU);
+    model.norm = ml->get_tensor("norm.weight", {n_embd}, -1, GGML_BACKEND_CPU);

     // "output" tensor
     {
         ggml_backend backend_output;
         if (n_gpu_layers > int(n_layer)) { // NOLINT
-            backend_output = LLAMA_BACKEND_OFFLOAD;
+            backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
         } else {
             backend_output = GGML_BACKEND_CPU;
         }

-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, -1, backend_output);
     }

     const int i_gpu_start = n_layer - n_gpu_layers;

     model.layers.resize(n_layer);
     for (uint32_t i = 0; i < n_layer; ++i) {
-        const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+        const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+        const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

         auto & layer = model.layers[i];

         std::string layers_i = "layers." + std::to_string(i);

-        layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+        layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, i, backend);

-        layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
-        layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
-        layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
-        layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
+        layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, i, backend_split);
+        layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, i, backend_split);
+        layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, i, backend_split);
+        layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, i, backend_split);

-        layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
+        layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, i, backend);

-        layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
-        layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
-        layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+        layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, i, backend_split);
+        layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, i, backend_split);
+        layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, i, backend_split);

-        if (backend == LLAMA_BACKEND_OFFLOAD) {
+        if (backend == GGML_BACKEND_GPU) {
             vram_total +=
                 ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                 ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
@@ -1127,6 +1150,8 @@ static void llama_model_load_internal(

 #if defined(GGML_USE_CUBLAS)
     {
+        ggml_cuda_set_tensor_split(tensor_split);
+
         size_t done_size = 0;
         size_t data_size = 0;
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {

@@ -1136,13 +1161,14 @@ static void llama_model_load_internal(
             }
         }
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+            ggml_backend backend = lt.ggml_tensor->backend;
+            if (backend != GGML_BACKEND_GPU && backend != GGML_BACKEND_GPU_SPLIT) {
                 continue;
             }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off, hparams.n_layer);
             done_size += lt.size;
         }
     }

@@ -1157,7 +1183,7 @@ static void llama_model_load_internal(
             }
         }
         for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CL) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_GPU) {
                 continue;
             }
             if (progress_callback) {

@@ -1167,6 +1193,8 @@ static void llama_model_load_internal(
             done_size += lt.size;
         }
     }
+#else
+    (void) tensor_split;
 #endif

     if (progress_callback) {

@@ -1185,6 +1213,7 @@ static bool llama_model_load(
         llama_context & lctx,
         int n_ctx,
         int n_gpu_layers,
+        float * tensor_split,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,

@@ -1192,8 +1221,8 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
-                                  vocab_only, progress_callback, progress_callback_user_data);
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, tensor_split, memory_type, use_mmap,
+                                  use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
         fprintf(stderr, "error loading model: %s\n", err.c_str());

@@ -2293,8 +2322,8 @@ struct llama_context * llama_init_from_file(

     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
-                          params.use_mmap, params.use_mlock, params.vocab_only,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, params.tensor_split,
+                          memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                           params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);

@@ -2547,7 +2576,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
     }
     size_t idx = model_loader->tensors_map.name_to_idx[base_name];
     llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-    base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+    base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, -1, GGML_BACKEND_CPU);
     lt.data = (uint8_t *) lt.ggml_tensor->data;
     model_loader->load_data_for(lt);
     lt.ggml_tensor->data = lt.data;
llama.h (14 changed lines)
@@ -1,6 +1,13 @@
 #ifndef LLAMA_H
 #define LLAMA_H

+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>

@@ -65,9 +72,10 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);

     struct llama_context_params {
         int n_ctx;        // text context
         int n_gpu_layers; // number of layers to store in VRAM
-        int seed;         // RNG seed, -1 for random
+        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        int seed;         // RNG seed, -1 for random

         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
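Note: putting the API changes together, a minimal usage sketch for the extended llama_context_params, assuming a cuBLAS build where LLAMA_MAX_DEVICES is greater than 1; the model path and the 3:1 split are placeholders:

// Minimal usage sketch for the extended llama_context_params.
// "model.bin" and the 3:1 split are placeholders; requires a cuBLAS build
// so that LLAMA_MAX_DEVICES > 1.
#include <cstdio>
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();

    params.n_gpu_layers    = 40;   // offload 40 layers to VRAM
    params.tensor_split[0] = 3.0f; // device 0 gets ~75% of each split tensor
    params.tensor_split[1] = 1.0f; // device 1 gets ~25%
    // remaining LLAMA_MAX_DEVICES entries stay 0 (unused)

    llama_context * ctx = llama_init_from_file("model.bin", params);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    return 0;
}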
|
Loading…
Add table
Add a link
Reference in a new issue