Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
This commit is contained in:
Johannes Gäßler 2023-06-06 21:33:23 +02:00 committed by GitHub
parent 44f906e853
commit 17366df842
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 1221 additions and 544 deletions

View file

@ -21,13 +21,15 @@
int32_t get_num_physical_cores();
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
// sampling parameters
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens