llama : refactor model loader with backend registry (#10026)

This commit is contained in:
Diego Devesa 2024-10-30 02:01:23 +01:00 committed by GitHub
parent 8f275a7c45
commit c5b0f4b5d9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 1903 additions and 2019 deletions

View file

@ -22,7 +22,7 @@ extern "C" {
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory (defaults to false)
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
};
@ -37,7 +37,6 @@ extern "C" {
//
struct ggml_backend_buffer_i {
const char * (*get_name) (ggml_backend_buffer_t buffer);
// (optional) free the buffer
void (*free_buffer) (ggml_backend_buffer_t buffer);
// base address of the buffer
@ -88,19 +87,16 @@ extern "C" {
void (*free)(ggml_backend_t backend);
// Will be moved to the device interface
// buffer allocation
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations
// (optional) complete all pending operations (required if the backend supports async operations)
void (*synchronize)(ggml_backend_t backend);
// (optional) compute graph with a plan (not used currently)
// (optional) graph plans (not used currently)
// compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@ -111,13 +107,6 @@ extern "C" {
// compute graph (always async if supported by the backend)
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
// new backends should implement the device interface instead
// These functions are being moved to the device interface
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
// record an event on this stream
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);