llama : remove LLAMA_MAX_DEVICES from llama.h

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-01-31 15:51:23 +02:00
parent efb7bdbbd0
commit 43312b2039
No known key found for this signature in database
GPG key ID: BF970631944C16B7
6 changed files with 61 additions and 55 deletions

View file

@@ -637,11 +637,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
const std::regex regex{R"([,/]+)"}; const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}}; std::vector<std::string> split_arg{it, {}};
if (split_arg.size() >= LLAMA_MAX_DEVICES) { if (split_arg.size() >= llama_max_devices()) {
invalid_param = true; invalid_param = true;
break; break;
} }
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { for (size_t i = 0; i < llama_max_devices(); ++i) {
if (i < split_arg.size()) { if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]); params.tensor_split[i] = std::stof(split_arg[i]);
} else { } else {
@@ -1651,7 +1651,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);

View file

@@ -63,7 +63,7 @@ struct gpt_params {
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width. int32_t n_beams = 0; // if non-zero then use beam search of given width.
int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_n = 1; // group-attention factor
int32_t grp_attn_w = 512; // group-attention width int32_t grp_attn_w = 512; // group-attention width

View file

@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();
const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f); const std::vector<float> t_split(llama_max_devices(), 0.0f);
model_params.n_gpu_layers = n_gpu_layers; model_params.n_gpu_layers = n_gpu_layers;
model_params.tensor_split = t_split.data(); model_params.tensor_split = t_split.data();

View file

@@ -160,7 +160,7 @@ struct cmd_params {
std::vector<int> main_gpu; std::vector<int> main_gpu;
std::vector<bool> no_kv_offload; std::vector<bool> no_kv_offload;
std::vector<bool> mul_mat_q; std::vector<bool> mul_mat_q;
std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split; std::vector<std::vector<float>> tensor_split;
int reps; int reps;
bool verbose; bool verbose;
output_formats output_format; output_formats output_format;
@@ -179,7 +179,7 @@ static const cmd_params cmd_params_defaults = {
/* main_gpu */ {0}, /* main_gpu */ {0},
/* no_kv_offload */ {false}, /* no_kv_offload */ {false},
/* mul_mat_q */ {true}, /* mul_mat_q */ {true},
/* tensor_split */ {{}}, /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* reps */ 5, /* reps */ 5,
/* verbose */ false, /* verbose */ false,
/* output_format */ MARKDOWN /* output_format */ MARKDOWN
@@ -380,10 +380,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
const std::regex regex{R"([;/]+)"}; const std::regex regex{R"([;/]+)"};
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
std::vector<std::string> split_arg{it, {}}; std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); GGML_ASSERT(split_arg.size() <= llama_max_devices());
std::array<float, LLAMA_MAX_DEVICES> tensor_split; std::vector<float> tensor_split(llama_max_devices());
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { for (size_t i = 0; i < llama_max_devices(); ++i) {
if (i < split_arg.size()) { if (i < split_arg.size()) {
tensor_split[i] = std::stof(split_arg[i]); tensor_split[i] = std::stof(split_arg[i]);
} else { } else {
@@ -459,7 +459,7 @@ struct cmd_params_instance {
int main_gpu; int main_gpu;
bool no_kv_offload; bool no_kv_offload;
bool mul_mat_q; bool mul_mat_q;
std::array<float, LLAMA_MAX_DEVICES> tensor_split; std::vector<float> tensor_split;
llama_model_params to_llama_mparams() const { llama_model_params to_llama_mparams() const {
llama_model_params mparams = llama_model_default_params(); llama_model_params mparams = llama_model_default_params();
@@ -582,7 +582,7 @@ struct test {
int main_gpu; int main_gpu;
bool no_kv_offload; bool no_kv_offload;
bool mul_mat_q; bool mul_mat_q;
std::array<float, LLAMA_MAX_DEVICES> tensor_split; std::vector<float> tensor_split;
int n_prompt; int n_prompt;
int n_gen; int n_gen;
std::string test_time; std::string test_time;
@@ -704,7 +704,7 @@ struct test {
std::vector<std::string> get_values() const { std::vector<std::string> get_values() const {
std::string tensor_split_str; std::string tensor_split_str;
int max_nonzero = 0; int max_nonzero = 0;
for (int i = 0; i < LLAMA_MAX_DEVICES; i++) { for (size_t i = 0; i < llama_max_devices(); i++) {
if (tensor_split[i] > 0) { if (tensor_split[i] > 0) {
max_nonzero = i; max_nonzero = i;
} }

View file

@@ -10090,8 +10090,16 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
return result; return result;
} }
int32_t llama_max_devices(void) { size_t llama_max_devices(void) {
return LLAMA_MAX_DEVICES; #if defined(GGML_USE_METAL)
return 1;
#elif defined(GGML_USE_CUBLAS)
return GGML_CUDA_MAX_DEVICES;
#elif defined(GGML_USE_SYCL)
return GGML_SYCL_MAX_DEVICES;
#else
return 1;
#endif
} }
bool llama_mmap_supported(void) { bool llama_mmap_supported(void) {

12
llama.h
View file

@@ -5,13 +5,10 @@
#include "ggml-backend.h" #include "ggml-backend.h"
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h" #include "ggml-cuda.h"
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
#elif defined(GGML_USE_SYCL) #elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h" #include "ggml-sycl.h"
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES #endif
#else
#define LLAMA_MAX_DEVICES 1
#endif // GGML_USE_CUBLAS
#include <stddef.h> #include <stddef.h>
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
@@ -201,7 +198,7 @@ extern "C" {
// LLAMA_SPLIT_LAYER: ignored // LLAMA_SPLIT_LAYER: ignored
int32_t main_gpu; int32_t main_gpu;
// proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split; const float * tensor_split;
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable. // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
@@ -338,7 +335,8 @@ extern "C" {
LLAMA_API int64_t llama_time_us(void); LLAMA_API int64_t llama_time_us(void);
LLAMA_API int32_t llama_max_devices(void); LLAMA_API size_t llama_max_devices(void);
LLAMA_API bool llama_mmap_supported (void); LLAMA_API bool llama_mmap_supported (void);
LLAMA_API bool llama_mlock_supported(void); LLAMA_API bool llama_mlock_supported(void);