llama : ggml-backend integration (#4766)
* llama : ggml-backend integration * ggml-backend : add names to buffers * fix unmap after loading * batched-bench : add tensor_split param * llama : check for null tensor_split * ggml-backend : increase GGML_MAX_BACKENDS * improve graph splitting, partial fix for --no-kv-offload * cuda : add ggml-backend split buffer support * cuda : do not create buffer types for devices that don't exist (fixes usage without CUDA devices available) * ggml : fix null backend dereference (#4807) * ggml : fix null backend dereference * ggml : also check ggml_backend_is_cpu * test-backend-ops : check buffer allocation failures * llama : add cparam (split_mode) and command line argument (--split-mode, -sm) to configure the split mode (none, layer or row) * ggml : fix mul_mat_id work size * llama : rewrite session kv load/set without graphs * minor * llama : only initialize used backends, free backends on context free * llama : abort ctx if cuda backend init fails * llama : rewrite lora with ggml-backend and compute on CPU ggml-ci * llama : only map to a backend buffer the region of the file mapping containing the tensors used in the buffer * opencl : add ggml-backend buffer type * cuda : only use batched_cublas with batched mat muls (fixes fp16 tg perf) * llama : on Metal, by default offload the full model ggml-ci * metal : page align the data ptr (#4854) * Apply suggestions from code review Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * cuda : fix split buffer free * address review comments * llama-bench : add split-mode parameter * fix whitespace * opencl : fix double initialization * server : add --split-mode parameter * use async copy and compute to improve multi-gpu performance ggml-ci * use async memcpys to copy the graph outputs to the CPU * fix opencl * use a host buffer for the cpu compute buffer for faster copies to the gpu --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
parent
584d674be6
commit
e7e4df031b
21 changed files with 2533 additions and 2295 deletions
34
ggml-alloc.c
34
ggml-alloc.c
|
@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
|||
}
|
||||
}
|
||||
|
||||
AT_PRINTF("block %d\n", best_fit_block);
|
||||
|
||||
if (best_fit_block == -1) {
|
||||
// the last block is our last resort
|
||||
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
|
||||
|
@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
|||
return;
|
||||
}
|
||||
}
|
||||
|
||||
struct free_block * block = &alloc->free_blocks[best_fit_block];
|
||||
void * addr = block->addr;
|
||||
block->addr = (char*)block->addr + size;
|
||||
|
@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
|||
}
|
||||
}
|
||||
|
||||
AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
|
||||
|
||||
tensor->data = addr;
|
||||
tensor->buffer = alloc->buffer;
|
||||
if (!alloc->measure) {
|
||||
|
@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
|
|||
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
|
||||
} else {
|
||||
alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
|
||||
ggml_backend_buffer_reset(alloc->buffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
|
|||
return alloc;
|
||||
}
|
||||
|
||||
ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
|
||||
ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
|
||||
// create a backend buffer to get the correct tensor allocation sizes
|
||||
ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
|
||||
|
||||
// TODO: move alloc initialization to a common ggml_tallocr_new_impl function
|
||||
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
|
||||
|
@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
|
|||
return alloc;
|
||||
}
|
||||
|
||||
ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
|
||||
ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
|
||||
ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
|
||||
return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
|
||||
}
|
||||
|
||||
ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
|
||||
// create a backend buffer to get the correct tensor allocation sizes
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
||||
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
|
||||
alloc->buffer_owned = true;
|
||||
return alloc;
|
||||
}
|
||||
|
||||
ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
|
||||
return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
|
||||
}
|
||||
|
||||
ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
|
||||
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
|
||||
|
||||
|
@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|||
|
||||
if (nbytes == 0) {
|
||||
// all the tensors in the context are already allocated
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
|
||||
if (buffer == NULL) {
|
||||
// failed to allocate buffer
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
|
||||
|
||||
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue