llama : initial ggml-backend integration (#4520)

* llama : initial ggml-backend integration

* add ggml-metal

* cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST
access all tensor data with ggml_backend_tensor_get/set

* add ggml_backend_buffer_clear
zero-init KV cache buffer

* add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data

* disable gpu backends with ngl 0

* more accurate mlock

* unmap offloaded part of the model

* use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap

* update quantize and lora

* update session copy/set to use ggml-backend

ggml-ci

* use posix_fadvise instead of posix_fadvise64

* ggml_backend_alloc_ctx_tensors_from_buft : remove old print

* llama_mmap::align_offset : use pointers instead of references for out parameters

* restore progress_callback behavior

* move final progress_callback call to load_all_data

* cuda : fix fprintf format string (minor)

* do not offload scales

* llama_mmap : avoid unmapping the same fragments again in the destructor

* remove unnecessary unmap

* metal : add default log function that prints to stderr, cleanup code

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
slaren 2023-12-21 21:07:46 +01:00 committed by GitHub
parent 31f27758fa
commit d232aca5a7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 926 additions and 752 deletions

24
ggml.c
View file

@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
size_t max_size = 0;
struct ggml_object * obj = ctx->objects_begin;
while (obj != NULL) {
if (obj->type == GGML_OBJECT_TENSOR) {
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
const size_t size = ggml_nbytes(tensor);
if (max_size < size) {
max_size = size;
}
}
obj = obj->next;
for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
max_size = MAX(max_size, ggml_nbytes(tensor));
}
return max_size;
@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
return result;
}
struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
struct ggml_object * obj = ctx->objects_begin;
char * const mem_buffer = ctx->mem_buffer;
@ -3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
return NULL;
}
struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
obj = obj->next;
@ -19213,6 +19201,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
return ctx->infos[i].name.data;
}
enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
return ctx->infos[i].type;
}
// returns the index
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
const int idx = gguf_find_key(ctx, key);