Multi GPU support, CUDA refactor, CUDA scratch buffer (#1703)

* CUDA multi GPU + scratch

ggml_cuda_compute_forward

Tensor parallelism

ggml_cuda_add

ggml_cuda_rms_norm

ggml_cuda_silu

CUDA scratch buffer

--main-gpu CLI option
This commit is contained in:
Johannes Gäßler 2023-06-06 21:33:23 +02:00 committed by GitHub
parent 44f906e853
commit 17366df842
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 1221 additions and 544 deletions

34
ggml.h
View file

@ -256,8 +256,8 @@ extern "C" {
enum ggml_backend {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_CUDA = 1,
GGML_BACKEND_CL = 2,
GGML_BACKEND_GPU = 10,
GGML_BACKEND_GPU_SPLIT = 20,
};
// model file types
@ -387,7 +387,9 @@ extern "C" {
char name[GGML_MAX_NAME];
char padding[16];
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[4];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -425,6 +427,25 @@ extern "C" {
bool no_alloc; // don't allocate memory for the tensor data
};
// compute types
enum ggml_task_type {
GGML_TASK_INIT = 0,
GGML_TASK_COMPUTE,
GGML_TASK_FINALIZE,
};
struct ggml_compute_params {
enum ggml_task_type type;
// ith = thread index, nth = number of threads
int ith, nth;
// work buffer for all threads
size_t wsize;
void * wdata;
};
// misc
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
@ -436,9 +457,10 @@ extern "C" {
GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
GGML_API int ggml_blck_size (enum ggml_type type);
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block