llama : initial ggml-backend integration (#4520)
* llama : initial ggml-backend integration * add ggml-metal * cuda backend can be used though ggml-backend with LLAMA_GGML_BACKEND_CUDA_TEST access all tensor data with ggml_backend_tensor_get/set * add ggml_backend_buffer_clear zero-init KV cache buffer * add ggml_backend_buffer_is_hos, used to avoid copies if possible when accesing tensor data * disable gpu backends with ngl 0 * more accurate mlock * unmap offloaded part of the model * use posix_fadvise64(.., POSIX_FADV_SEQUENTIAL) to improve performance with mmap * update quantize and lora * update session copy/set to use ggml-backend ggml-ci * use posix_fadvise instead of posix_fadvise64 * ggml_backend_alloc_ctx_tensors_from_buft : remove old print * llama_mmap::align_offset : use pointers instead of references for out parameters * restore progress_callback behavior * move final progress_callback call to load_all_data * cuda : fix fprintf format string (minor) * do not offload scales * llama_mmap : avoid unmapping the same fragments again in the destructor * remove unnecessary unmap * metal : add default log function that prints to stderr, cleanup code ggml-ci --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
31f27758fa
commit
d232aca5a7
11 changed files with 926 additions and 752 deletions
|
@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
|||
|
||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
||||
|
||||
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
|
||||
// helper to check if the device supports a specific family
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue