llama : revert enum name changes from this PR
ggml-ci
This commit is contained in:
parent
5f5b1b57ca
commit
42ddf4846c
6 changed files with 55 additions and 55 deletions
|
@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
std::string value(argv[i]);
|
std::string value(argv[i]);
|
||||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
|
||||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
|
||||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
|
||||||
else { invalid_param = true; break; }
|
else { invalid_param = true; break; }
|
||||||
} else if (arg == "--rope-scale") {
|
} else if (arg == "--rope-scale") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
|
@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
}
|
}
|
||||||
std::string arg_next = argv[i];
|
std::string arg_next = argv[i];
|
||||||
if (arg_next == "none") {
|
if (arg_next == "none") {
|
||||||
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
params.split_mode = LLAMA_SPLIT_NONE;
|
||||||
} else if (arg_next == "layer") {
|
} else if (arg_next == "layer") {
|
||||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
params.split_mode = LLAMA_SPLIT_LAYER;
|
||||||
} else if (arg_next == "row") {
|
} else if (arg_next == "row") {
|
||||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
params.split_mode = LLAMA_SPLIT_ROW;
|
||||||
} else {
|
} else {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -61,7 +61,7 @@ struct gpt_params {
|
||||||
float p_split = 0.1f; // speculative decoding split probability
|
float p_split = 0.1f; // speculative decoding split probability
|
||||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||||
llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
|
||||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||||
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
|
||||||
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||||
|
@ -75,7 +75,7 @@ struct gpt_params {
|
||||||
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
float yarn_beta_fast = 32.0f; // YaRN low correction dim
|
||||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||||
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||||
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||||
|
|
||||||
// // sampling parameters
|
// // sampling parameters
|
||||||
|
|
|
@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) {
|
||||||
|
|
||||||
static const char * split_mode_str(llama_split_mode mode) {
|
static const char * split_mode_str(llama_split_mode mode) {
|
||||||
switch (mode) {
|
switch (mode) {
|
||||||
case LLAMA_SPLIT_MODE_NONE: return "none";
|
case LLAMA_SPLIT_NONE: return "none";
|
||||||
case LLAMA_SPLIT_MODE_LAYER: return "layer";
|
case LLAMA_SPLIT_LAYER: return "layer";
|
||||||
case LLAMA_SPLIT_MODE_ROW: return "row";
|
case LLAMA_SPLIT_ROW: return "row";
|
||||||
default: GGML_ASSERT(!"invalid split mode");
|
default: GGML_ASSERT(!"invalid split mode");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* type_v */ {GGML_TYPE_F16},
|
/* type_v */ {GGML_TYPE_F16},
|
||||||
/* n_threads */ {get_num_physical_cores()},
|
/* n_threads */ {get_num_physical_cores()},
|
||||||
/* n_gpu_layers */ {99},
|
/* n_gpu_layers */ {99},
|
||||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
/* split_mode */ {LLAMA_SPLIT_LAYER},
|
||||||
/* main_gpu */ {0},
|
/* main_gpu */ {0},
|
||||||
/* no_kv_offload */ {false},
|
/* no_kv_offload */ {false},
|
||||||
/* mul_mat_q */ {true},
|
/* mul_mat_q */ {true},
|
||||||
|
@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
for (const auto & m : p) {
|
for (const auto & m : p) {
|
||||||
llama_split_mode mode;
|
llama_split_mode mode;
|
||||||
if (m == "none") {
|
if (m == "none") {
|
||||||
mode = LLAMA_SPLIT_MODE_NONE;
|
mode = LLAMA_SPLIT_NONE;
|
||||||
} else if (m == "layer") {
|
} else if (m == "layer") {
|
||||||
mode = LLAMA_SPLIT_MODE_LAYER;
|
mode = LLAMA_SPLIT_LAYER;
|
||||||
} else if (m == "row") {
|
} else if (m == "row") {
|
||||||
mode = LLAMA_SPLIT_MODE_ROW;
|
mode = LLAMA_SPLIT_ROW;
|
||||||
} else {
|
} else {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -2082,9 +2082,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
std::string value(argv[i]);
|
std::string value(argv[i]);
|
||||||
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; }
|
/**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; }
|
||||||
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; }
|
else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; }
|
||||||
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; }
|
else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; }
|
||||||
else { invalid_param = true; break; }
|
else { invalid_param = true; break; }
|
||||||
}
|
}
|
||||||
else if (arg == "--rope-freq-base")
|
else if (arg == "--rope-freq-base")
|
||||||
|
@ -2208,15 +2208,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
std::string arg_next = argv[i];
|
std::string arg_next = argv[i];
|
||||||
if (arg_next == "none")
|
if (arg_next == "none")
|
||||||
{
|
{
|
||||||
params.split_mode = LLAMA_SPLIT_MODE_NONE;
|
params.split_mode = LLAMA_SPLIT_NONE;
|
||||||
}
|
}
|
||||||
else if (arg_next == "layer")
|
else if (arg_next == "layer")
|
||||||
{
|
{
|
||||||
params.split_mode = LLAMA_SPLIT_MODE_LAYER;
|
params.split_mode = LLAMA_SPLIT_LAYER;
|
||||||
}
|
}
|
||||||
else if (arg_next == "row")
|
else if (arg_next == "row")
|
||||||
{
|
{
|
||||||
params.split_mode = LLAMA_SPLIT_MODE_ROW;
|
params.split_mode = LLAMA_SPLIT_ROW;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
|
46
llama.cpp
46
llama.cpp
|
@ -850,9 +850,9 @@ struct LLM_TN {
|
||||||
//
|
//
|
||||||
|
|
||||||
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
||||||
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
|
{ LLAMA_ROPE_SCALING_NONE, "none" },
|
||||||
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
|
{ LLAMA_ROPE_SCALING_LINEAR, "linear" },
|
||||||
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
|
{ LLAMA_ROPE_SCALING_YARN, "yarn" },
|
||||||
};
|
};
|
||||||
|
|
||||||
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
||||||
|
@ -862,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
return LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
||||||
|
@ -1581,7 +1581,7 @@ struct llama_hparams {
|
||||||
bool causal_attn = true;
|
bool causal_attn = true;
|
||||||
bool need_kq_pos = false;
|
bool need_kq_pos = false;
|
||||||
|
|
||||||
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
enum llama_pooling_type pooling_type = LLAMA_POOLING_NONE;
|
||||||
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
||||||
|
|
||||||
bool operator!=(const llama_hparams & other) const {
|
bool operator!=(const llama_hparams & other) const {
|
||||||
|
@ -3007,7 +3007,7 @@ static void llm_load_hparams(
|
||||||
std::string rope_scaling("linear");
|
std::string rope_scaling("linear");
|
||||||
ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
|
ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
|
||||||
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
|
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
|
||||||
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
|
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
|
||||||
|
|
||||||
// rope_freq_scale (inverse of the kv) is optional
|
// rope_freq_scale (inverse of the kv) is optional
|
||||||
float ropescale = 0.0f;
|
float ropescale = 0.0f;
|
||||||
|
@ -3655,7 +3655,7 @@ static bool llm_load_tensors(
|
||||||
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
|
if (split_mode == LLAMA_SPLIT_LAYER) {
|
||||||
// calculate the split points
|
// calculate the split points
|
||||||
int device_count = llama_get_device_count();
|
int device_count = llama_get_device_count();
|
||||||
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
||||||
|
@ -3694,10 +3694,10 @@ static bool llm_load_tensors(
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
ggml_backend_buffer_type_t split_buft;
|
ggml_backend_buffer_type_t split_buft;
|
||||||
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
|
if (split_mode == LLAMA_SPLIT_ROW) {
|
||||||
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
||||||
} else {
|
} else {
|
||||||
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
|
// LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
|
||||||
split_buft = llama_default_buffer_type_offload(main_gpu);
|
split_buft = llama_default_buffer_type_offload(main_gpu);
|
||||||
}
|
}
|
||||||
// assign the repeating layers
|
// assign the repeating layers
|
||||||
|
@ -5028,7 +5028,7 @@ struct llm_build_context {
|
||||||
n_kv (worst_case ? n_ctx : kv_self.n),
|
n_kv (worst_case ? n_ctx : kv_self.n),
|
||||||
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
||||||
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
||||||
pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
|
pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_NONE),
|
||||||
rope_type (hparams.rope_type),
|
rope_type (hparams.rope_type),
|
||||||
cb (cb),
|
cb (cb),
|
||||||
buf_compute_meta (lctx.buf_compute_meta) {
|
buf_compute_meta (lctx.buf_compute_meta) {
|
||||||
|
@ -6011,12 +6011,12 @@ struct llm_build_context {
|
||||||
cur = inpL;
|
cur = inpL;
|
||||||
|
|
||||||
// pooling layer
|
// pooling layer
|
||||||
if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
if (pooling_type == LLAMA_POOLING_MEAN) {
|
||||||
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
|
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
|
||||||
} else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
} else if (pooling_type == LLAMA_POOLING_CLS) {
|
||||||
cur = ggml_get_rows(ctx0, cur, inp_cls);
|
cur = ggml_get_rows(ctx0, cur, inp_cls);
|
||||||
} else {
|
} else {
|
||||||
GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
|
GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
|
||||||
}
|
}
|
||||||
cb(cur, "result_embd", -1);
|
cb(cur, "result_embd", -1);
|
||||||
|
|
||||||
|
@ -7684,7 +7684,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
||||||
const int64_t n_tokens = batch.n_tokens;
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
|
||||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
||||||
|
@ -7712,7 +7712,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
|
||||||
const int64_t n_tokens = batch.n_tokens;
|
const int64_t n_tokens = batch.n_tokens;
|
||||||
|
|
||||||
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
||||||
|
@ -11286,7 +11286,7 @@ static int llama_apply_lora_from_file_internal(
|
||||||
struct llama_model_params llama_model_default_params() {
|
struct llama_model_params llama_model_default_params() {
|
||||||
struct llama_model_params result = {
|
struct llama_model_params result = {
|
||||||
/*.n_gpu_layers =*/ 0,
|
/*.n_gpu_layers =*/ 0,
|
||||||
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
/*.split_mode =*/ LLAMA_SPLIT_LAYER,
|
||||||
/*.main_gpu =*/ 0,
|
/*.main_gpu =*/ 0,
|
||||||
/*.tensor_split =*/ nullptr,
|
/*.tensor_split =*/ nullptr,
|
||||||
/*.progress_callback =*/ nullptr,
|
/*.progress_callback =*/ nullptr,
|
||||||
|
@ -11312,7 +11312,7 @@ struct llama_context_params llama_context_default_params() {
|
||||||
/*.n_batch =*/ 512,
|
/*.n_batch =*/ 512,
|
||||||
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
||||||
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
||||||
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
|
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
|
||||||
/*.rope_freq_base =*/ 0.0f,
|
/*.rope_freq_base =*/ 0.0f,
|
||||||
/*.rope_freq_scale =*/ 0.0f,
|
/*.rope_freq_scale =*/ 0.0f,
|
||||||
/*.yarn_ext_factor =*/ -1.0f,
|
/*.yarn_ext_factor =*/ -1.0f,
|
||||||
|
@ -11500,16 +11500,16 @@ struct llama_context * llama_new_context_with_model(
|
||||||
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
||||||
|
|
||||||
auto rope_scaling_type = params.rope_scaling_type;
|
auto rope_scaling_type = params.rope_scaling_type;
|
||||||
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
|
if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
|
||||||
rope_scaling_type = hparams.rope_scaling_type_train;
|
rope_scaling_type = hparams.rope_scaling_type_train;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
|
if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
|
||||||
cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
|
cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
|
if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
|
||||||
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
|
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||||
|
@ -11543,8 +11543,8 @@ struct llama_context * llama_new_context_with_model(
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CUBLAS)
|
#elif defined(GGML_USE_CUBLAS)
|
||||||
if (model->n_gpu_layers > 0) {
|
if (model->n_gpu_layers > 0) {
|
||||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
// with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
|
||||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
|
||||||
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
||||||
|
@ -11553,7 +11553,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
}
|
}
|
||||||
ctx->backends.push_back(backend);
|
ctx->backends.push_back(backend);
|
||||||
} else {
|
} else {
|
||||||
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
||||||
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
||||||
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
|
|
22
llama.h
22
llama.h
|
@ -114,23 +114,23 @@ extern "C" {
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llama_rope_scaling_type {
|
enum llama_rope_scaling_type {
|
||||||
LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
|
LLAMA_ROPE_SCALING_UNSPECIFIED = -1,
|
||||||
LLAMA_ROPE_SCALING_TYPE_NONE = 0,
|
LLAMA_ROPE_SCALING_NONE = 0,
|
||||||
LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
|
LLAMA_ROPE_SCALING_LINEAR = 1,
|
||||||
LLAMA_ROPE_SCALING_TYPE_YARN = 2,
|
LLAMA_ROPE_SCALING_YARN = 2,
|
||||||
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
|
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llama_pooling_type {
|
enum llama_pooling_type {
|
||||||
LLAMA_POOLING_TYPE_NONE = 0,
|
LLAMA_POOLING_NONE = 0,
|
||||||
LLAMA_POOLING_TYPE_MEAN = 1,
|
LLAMA_POOLING_MEAN = 1,
|
||||||
LLAMA_POOLING_TYPE_CLS = 2,
|
LLAMA_POOLING_CLS = 2,
|
||||||
};
|
};
|
||||||
|
|
||||||
enum llama_split_mode {
|
enum llama_split_mode {
|
||||||
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
|
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||||
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
|
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||||
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
|
LLAMA_SPLIT_ROW = 2, // split rows across GPUs
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct llama_token_data {
|
typedef struct llama_token_data {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue