llama : initial ggml-backend integration (#4520)
* llama : initial ggml-backend integration * add ggml-metal * cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST access all tensor data with ggml_backend_tensor_get/set * add ggml_backend_buffer_clear zero-init KV cache buffer * add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data * disable gpu backends with ngl 0 * more accurate mlock * unmap offloaded part of the model * use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap * update quantize and lora * update session copy/set to use ggml-backend ggml-ci * use posix_fadvise instead of posix_fadvise64 * ggml_backend_alloc_ctx_tensors_from_buft : remove old print * llama_mmap::align_offset : use pointers instead of references for out parameters * restore progress_callback behavior * move final progress_callback call to load_all_data * cuda : fix fprintf format string (minor) * do not offload scales * llama_mmap : avoid unmapping the same fragments again in the destructor * remove unnecessary unmap * metal : add default log function that prints to stderr, cleanup code ggml-ci --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
31f27758fa
commit
d232aca5a7
11 changed files with 926 additions and 752 deletions
24
ggml.c
24
ggml.c
|
@ -2383,20 +2383,8 @@ size_t ggml_get_mem_size(const struct ggml_context * ctx) {
|
|||
size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
|
||||
size_t max_size = 0;
|
||||
|
||||
struct ggml_object * obj = ctx->objects_begin;
|
||||
|
||||
while (obj != NULL) {
|
||||
if (obj->type == GGML_OBJECT_TENSOR) {
|
||||
struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
|
||||
|
||||
const size_t size = ggml_nbytes(tensor);
|
||||
|
||||
if (max_size < size) {
|
||||
max_size = size;
|
||||
}
|
||||
}
|
||||
|
||||
obj = obj->next;
|
||||
for (struct ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor != NULL; tensor = ggml_get_next_tensor(ctx, tensor)) {
|
||||
max_size = MAX(max_size, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
return max_size;
|
||||
|
@ -3093,7 +3081,7 @@ struct ggml_tensor * ggml_view_tensor(
|
|||
return result;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
|
||||
struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) {
|
||||
struct ggml_object * obj = ctx->objects_begin;
|
||||
|
||||
char * const mem_buffer = ctx->mem_buffer;
|
||||
|
@ -3109,7 +3097,7 @@ struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx) {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_get_next_tensor(struct ggml_context * ctx, struct ggml_tensor * tensor) {
|
||||
struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struct ggml_tensor * tensor) {
|
||||
struct ggml_object * obj = (struct ggml_object *) ((char *)tensor - GGML_OBJECT_SIZE);
|
||||
obj = obj->next;
|
||||
|
||||
|
@ -19213,6 +19201,10 @@ char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
|
|||
return ctx->infos[i].name.data;
|
||||
}
|
||||
|
||||
enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
|
||||
return ctx->infos[i].type;
|
||||
}
|
||||
|
||||
// returns the index
|
||||
static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
|
||||
const int idx = gguf_find_key(ctx, key);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue