Merge branch 'master' into compilade/mamba2

commit 038d958333

132 changed files with 6559 additions and 5146 deletions
@@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
// Graph allocator
/*
Example usage:
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

// optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch));

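To make the example above concrete, here is a minimal sketch of the graph-allocator flow it describes; build_graph(), batch, max_batch and backend are user-side placeholders, not part of the ggml API.

    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    // optional: reserve once with a worst-case graph to avoid reallocations later
    ggml_gallocr_reserve(galloc, build_graph(max_batch));

    // per iteration: build the graph, allocate its tensors, then run it on a backend
    struct ggml_cgraph * graph = build_graph(batch);
    ggml_gallocr_alloc_graph(galloc, graph);
    ggml_backend_graph_compute(backend, graph);

    ggml_gallocr_free(galloc);
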
@@ -12,43 +12,52 @@ extern "C" {
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
typedef struct ggml_backend_reg * ggml_backend_reg_t;
typedef struct ggml_backend_device * ggml_backend_dev_t;

//
// Backend buffer type
//

GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);

//
// Backend buffer
//

// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

// buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
};

GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);

// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

//
// Backend
// Backend (stream)
//

GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);

@@ -64,9 +73,9 @@ extern "C" {
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

// "offset" refers to the offset of the tensor data for setting/getting data
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

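As a quick illustration of the set/get declarations above (a sketch only: backend and an already-allocated tensor t are assumed, and offset/size are in bytes relative to the start of the tensor data):

    float src[16] = { 0 };
    float dst[16] = { 0 };

    // synchronous copies to/from wherever the tensor data lives (host or device)
    ggml_backend_tensor_set(t, src, 0, sizeof(src));
    ggml_backend_tensor_get(t, dst, 0, sizeof(dst));

    // the async variants require an explicit synchronization point
    ggml_backend_tensor_set_async(backend, t, src, 0, sizeof(src));
    ggml_backend_synchronize(backend);
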
@@ -76,65 +85,121 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);

// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);

//
// CPU backend
// Events
//

GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);

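The hunk above replaces the old event API with one where events are created from a device and recorded on a specific backend (stream). A hedged sketch of how the new signatures compose, assuming backend_gpu and backend_cpu are initialized streams whose device reports caps.events:

    ggml_backend_event_t event = ggml_backend_event_new(ggml_backend_get_device(backend_gpu));

    // queue work on one stream and mark its completion
    ggml_backend_graph_compute_async(backend_gpu, graph_gpu);
    ggml_backend_event_record(event, backend_gpu);

    // either make another stream wait for it without blocking the host ...
    ggml_backend_event_wait(backend_cpu, event);

    // ... or block the host until the recorded point has been reached
    ggml_backend_event_synchronize(event);

    ggml_backend_event_free(event);
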
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
//
// Backend device
//

// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
enum ggml_backend_dev_type {
GGML_BACKEND_DEVICE_TYPE_CPU,
GGML_BACKEND_DEVICE_TYPE_GPU,
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
};

GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
// functionality supported by the device
struct ggml_backend_dev_caps {
// asynchronous operations
bool async;
// pinned host buffer
bool host_buffer;
// creating buffers from host ptr
bool buffer_from_host_ptr;
// event synchronization
bool events;
};

#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
// all the device properties
struct ggml_backend_dev_props {
const char * name;
const char * description;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
};

GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);

GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);

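Given a ggml_backend_dev_t handle (obtained from the enumeration functions declared further below), the device properties above can be queried as in this sketch (the printf formatting is illustrative and assumes <stdio.h>):

    static void print_device_info(ggml_backend_dev_t dev) {
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        printf("%s (%s): %zu/%zu MiB free, async=%d host_buffer=%d buffer_from_host_ptr=%d events=%d\n",
               props.name, props.description,
               props.memory_free  / 1024 / 1024,
               props.memory_total / 1024 / 1024,
               props.caps.async, props.caps.host_buffer,
               props.caps.buffer_from_host_ptr, props.caps.events);
    }
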
//
// Backend (reg)
//

GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg);
GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);

// Functions that may be obtained using ggml_backend_reg_get_proc_address
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);

//
// Backend registry
//

// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
// Backend (reg) enumeration
GGML_API size_t ggml_backend_reg_count(void);
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);

GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
// Device enumeration
GGML_API size_t ggml_backend_dev_count(void);
GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);

// Direct backend (stream) initialization
// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);

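A sketch of generic initialization through the registry and device enumeration declared above; the backend name "CPU" used for the fallback is an assumption, and printf assumes <stdio.h>:

    // prefer the best full-capability device, fall back to a named backend
    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == NULL) {
        backend = ggml_backend_init_by_name("CPU", NULL);
    }

    // the registries can also be walked directly
    for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        printf("%s: %zu device(s)\n", ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
    }
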
//
// Backend scheduler
//

// The backend scheduler allows for multiple backends to be used together
// The backend scheduler allows for multiple backend devices to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation

@@ -169,9 +234,9 @@ extern "C" {
}
*/

struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;

// Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//

@@ -185,7 +250,7 @@ extern "C" {
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);

@@ -200,7 +265,7 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);

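Putting the scheduler calls above together, a minimal round trip might look like this sketch; backend_gpu, backend_cpu, build_graph() and the batch sizes are assumptions, GGML_DEFAULT_GRAPH_SIZE and parallel=false are illustrative defaults, and the ggml_backend_sched_new signature is the one in this revision:

    // by convention the CPU backend goes last, as the fallback
    ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE, false);

    // reserve compute buffers once from a worst-case graph
    ggml_backend_sched_reserve(sched, build_graph(max_batch));

    // per iteration: allocate and compute the actual graph
    struct ggml_cgraph * graph = build_graph(batch);
    if (ggml_backend_sched_alloc_graph(sched, graph)) {
        ggml_backend_sched_graph_compute(sched, graph);
    }

    ggml_backend_sched_free(sched);
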
@@ -226,7 +291,7 @@ extern "C" {
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);

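For the comparison helper above, the eval callback is called per graph node with the outputs produced by the two backends. A hedged sketch (the shape check stands in for a real element-wise comparison, and the return value is assumed to mean "continue"):

    static bool eval_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        (void) node_index;
        (void) user_data;
        // t1 and t2 are the same node computed by backend1 and backend2; a real
        // implementation would read both back with ggml_backend_tensor_get and compare
        return ggml_are_same_shape(t1, t2);
    }

    // ggml_backend_compare_graph_backend(backend1, backend2, graph, eval_cb, NULL);
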
@@ -235,6 +300,26 @@ extern "C" {
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

//
// CPU backend
//

GGML_API ggml_backend_t ggml_backend_cpu_init(void);

GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

// Create a backend buffer from an existing pointer
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif

#ifdef __cplusplus
}

@ -9,13 +9,15 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
// backend API
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
|
||||
GGML_API ggml_backend_t ggml_backend_blas_init(void);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
|
||||
GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
|
||||
|
||||
// number of threads used for conversion to float
|
||||
// for openblas and blis, this will also set the number of threads used for blas operations
|
||||
GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
||||
GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -44,7 +44,7 @@ extern "C" {
|
|||
* @param device The index of the device to initialize.
|
||||
* @return A pointer to the initialized backend instance, or nullptr on failure.
|
||||
*/
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
|
||||
GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
||||
|
||||
/**
|
||||
* @brief Checks if a given backend is a CANN backend.
|
||||
|
@ -55,7 +55,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
|
|||
* @param backend The backend instance to check.
|
||||
* @return True if the backend is a CANN backend, false otherwise.
|
||||
*/
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
|
||||
GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the CANN buffer type for a specified device.
|
||||
|
@ -67,7 +67,7 @@ GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
|
|||
* @return A pointer to the buffer type interface for the specified device, or
|
||||
* nullptr if the device index is out of range.
|
||||
*/
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t
|
||||
GGML_API ggml_backend_buffer_type_t
|
||||
ggml_backend_cann_buffer_type(int32_t device);
|
||||
|
||||
/**
|
||||
|
@ -78,14 +78,14 @@ ggml_backend_cann_buffer_type(int32_t device);
|
|||
*
|
||||
* @return The number of CANN devices available.
|
||||
*/
|
||||
GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
|
||||
GGML_API int32_t ggml_backend_cann_get_device_count(void);
|
||||
|
||||
/**
|
||||
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
|
||||
*
|
||||
* @return A pointer to the host buffer type interface.
|
||||
*/
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the description of a specific CANN device.
|
||||
|
@ -97,7 +97,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type
|
|||
* @param description Pointer to a buffer where the description will be written.
|
||||
* @param description_size Size of the description buffer.
|
||||
*/
|
||||
GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
|
||||
GGML_API void ggml_backend_cann_get_device_description(
|
||||
int32_t device, char* description, size_t description_size);
|
||||
|
||||
/**
|
||||
|
@ -112,20 +112,9 @@ GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
|
|||
* @param total Pointer to a variable where the total memory size will be
|
||||
* stored.
|
||||
*/
|
||||
GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device,
|
||||
size_t* free,
|
||||
size_t* total);
|
||||
|
||||
/**
|
||||
* @brief Set the logging callback for GGML.
|
||||
*
|
||||
* This function sets the logging callback and user data for logging.
|
||||
*
|
||||
* @param log_callback The logging callback to set.
|
||||
* @param user_data User data to pass to the logging callback.
|
||||
*/
|
||||
GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
|
||||
void* user_data);
|
||||
GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
|
||||
size_t* free,
|
||||
size_t* total);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -3,6 +3,10 @@
|
|||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_HIPBLAS
|
||||
#define GGML_CUDA_NAME "ROCm"
|
||||
#define GGML_CUBLAS_NAME "hipBLAS"
|
||||
|
@ -13,35 +17,31 @@
|
|||
#define GGML_CUDA_NAME "CUDA"
|
||||
#define GGML_CUBLAS_NAME "cuBLAS"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_CUDA_MAX_DEVICES 16
|
||||
|
||||
// backend API
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
|
||||
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
|
||||
// device buffer
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||
|
||||
// split tensor buffer that splits matrices by rows across multiple devices
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
|
||||
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||
|
||||
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||
GGML_API int ggml_backend_cuda_get_device_count(void);
|
||||
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
||||
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
||||
GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
||||
GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
|
||||
|
||||
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
// Note: this description is outdated
|
||||
//
|
||||
// An interface allowing to compute ggml_cgraph with Metal
|
||||
//
|
||||
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
|
||||
|
@ -37,17 +39,17 @@ extern "C" {
|
|||
// user-code should use only these functions
|
||||
//
|
||||
|
||||
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
||||
GGML_DEPRECATED(
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
|
||||
"obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
|
||||
|
||||
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
|
||||
// helper to check if the device supports a specific family
|
||||
// ideally, the user code should be doing these checks
|
||||
|
@ -57,6 +59,8 @@ GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int fam
|
|||
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
|
||||
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -10,14 +10,18 @@ extern "C" {
|
|||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
// backend API
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
||||
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
|
||||
GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
||||
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||
|
||||
GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||
GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
|
||||
|
||||
GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -23,20 +23,20 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
|
|||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
||||
|
||||
// split tensor buffer that splits matrices by rows across multiple devices
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
||||
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
|
||||
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
|
||||
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
|
||||
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
|
||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
||||
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
|
||||
GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
|
||||
GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
|
||||
GGML_API int ggml_backend_sycl_get_device_count();
|
||||
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
||||
|
||||
// SYCL doesn't support registering host memory, keep here for reference
|
||||
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
|
||||
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
|
||||
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
|
||||
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -13,16 +13,16 @@ extern "C" {
|
|||
GGML_API void ggml_vk_instance_init(void);
|
||||
|
||||
// backend API
|
||||
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
||||
GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
|
||||
GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
|
||||
GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
|
||||
GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
|
||||
GGML_API int ggml_backend_vk_get_device_count(void);
|
||||
GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -187,16 +187,6 @@
|
|||
# define GGML_API
|
||||
#endif
|
||||
|
||||
#ifdef GGML_MULTIPLATFORM
|
||||
# if defined(_WIN32)
|
||||
# define GGML_CALL
|
||||
# else
|
||||
# define GGML_CALL __attribute__((__ms_abi__))
|
||||
# endif
|
||||
#else
|
||||
# define GGML_CALL
|
||||
#endif
|
||||
|
||||
// TODO: support for clang
|
||||
#ifdef __GNUC__
|
||||
# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
|
||||
|
@ -340,7 +330,7 @@ extern "C" {
|
|||
};
|
||||
|
||||
// get ggml_status name string
|
||||
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
|
||||
GGML_API const char * ggml_status_to_string(enum ggml_status status);
|
||||
|
||||
// ieee 754-2008 half-precision float16
|
||||
// todo: make this not an integral type
|
||||
|
@ -466,6 +456,7 @@ extern "C" {
|
|||
GGML_OP_SUM_ROWS,
|
||||
GGML_OP_MEAN,
|
||||
GGML_OP_ARGMAX,
|
||||
GGML_OP_COUNT_EQUAL,
|
||||
GGML_OP_REPEAT,
|
||||
GGML_OP_REPEAT_BACK,
|
||||
GGML_OP_CONCAT,
|
||||
|
@ -716,46 +707,46 @@ extern "C" {
|
|||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||
|
||||
GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
||||
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||
|
||||
GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
|
||||
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||
GGML_API int64_t ggml_blck_size(enum ggml_type type);
|
||||
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||
|
||||
GGML_DEPRECATED(
|
||||
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
||||
"use ggml_row_size() instead");
|
||||
|
||||
GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
|
||||
GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
|
||||
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
||||
GGML_API const char * ggml_type_name(enum ggml_type type);
|
||||
GGML_API const char * ggml_op_name (enum ggml_op op);
|
||||
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
||||
|
||||
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
||||
GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
||||
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
||||
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
||||
|
||||
GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
|
||||
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
||||
|
||||
// TODO: temporary until model loading of ggml examples is refactored
|
||||
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
||||
|
||||
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||
|
||||
GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
|
||||
GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
|
||||
GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
||||
GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
||||
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
|
||||
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
||||
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
||||
|
||||
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||
|
@ -847,7 +838,7 @@ extern "C" {
|
|||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
||||
|
@ -1004,6 +995,12 @@ extern "C" {
|
|||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// count number of equal elements in a and b
|
||||
GGML_API struct ggml_tensor * ggml_count_equal(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// if a is the same shape as b, and a is not parameter, return a
|
||||
// otherwise, return a new tensor: repeat(a) to fit in b
|
||||
GGML_API struct ggml_tensor * ggml_repeat(
|
||||
|
@ -1561,7 +1558,7 @@ extern "C" {
|
|||
"use ggml_rope_ext_inplace instead");
|
||||
|
||||
// compute correction dims for YaRN RoPE scaling
|
||||
GGML_CALL void ggml_rope_yarn_corr_dims(
|
||||
void ggml_rope_yarn_corr_dims(
|
||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
||||
|
||||
// rotary position embedding backward, i.e compute dx from dy
|
||||
|
@ -2179,6 +2176,10 @@ extern "C" {
|
|||
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
|
||||
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
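A minimal example of the ggml_log_set hook added here (a sketch; the level filter is illustrative and fputs assumes <stdio.h>):

    static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        // forward only warnings and errors, drop everything else
        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR) {
            fputs(text, stderr);
        }
    }

    // install once at startup; passing NULL restores the default stderr output
    ggml_log_set(my_log, NULL);
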
|
||||
|
||||
// optimization parameters
|
||||
//
|
||||
// see ggml.c (ggml_opt_default_params) for default values
|
||||
|
@ -2536,7 +2537,7 @@ extern "C" {
|
|||
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
typedef struct {
|
||||
struct ggml_type_traits {
|
||||
const char * type_name;
|
||||
int64_t blck_size;
|
||||
int64_t blck_size_interleave; // interleave elements in blocks
|
||||
|
@ -2552,9 +2553,9 @@ extern "C" {
|
|||
int64_t ncols; // number of columns to process simultaneously
|
||||
ggml_gemv_t gemv;
|
||||
ggml_gemm_t gemm;
|
||||
} ggml_type_traits_t;
|
||||
};
|
||||
|
||||
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
||||
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
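Usage sketch for the new accessor that replaces ggml_internal_get_type_traits, touching only fields that appear in this diff (printf assumes <stdio.h>):

    const struct ggml_type_traits * traits = ggml_get_type_traits(GGML_TYPE_Q4_0);
    printf("%s: block size = %lld\n", traits->type_name, (long long) traits->blck_size);
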
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -163,8 +163,8 @@ if (GGML_OPENMP)
|
|||
list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
|
||||
if (GGML_MUSA)
|
||||
list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp")
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
|
||||
list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
|
||||
endif()
|
||||
else()
|
||||
message(WARNING "OpenMP not found")
|
||||
|
@ -190,22 +190,24 @@ if (GGML_BLAS)
|
|||
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
|
||||
find_package(PkgConfig REQUIRED)
|
||||
if (${GGML_BLAS_VENDOR} MATCHES "Generic")
|
||||
pkg_check_modules(DepBLAS REQUIRED blas)
|
||||
pkg_check_modules(DepBLAS blas)
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
|
||||
# As of openblas v0.3.22, the 64-bit is named openblas64.pc
|
||||
pkg_check_modules(DepBLAS openblas64)
|
||||
if (NOT DepBLAS_FOUND)
|
||||
pkg_check_modules(DepBLAS REQUIRED openblas)
|
||||
pkg_check_modules(DepBLAS openblas)
|
||||
endif()
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
|
||||
pkg_check_modules(DepBLAS REQUIRED blis)
|
||||
add_compile_definitions(GGML_BLAS_USE_BLIS)
|
||||
pkg_check_modules(DepBLAS blis)
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
|
||||
pkg_check_modules(DepBLAS REQUIRED blas-atlas)
|
||||
pkg_check_modules(DepBLAS blas-atlas)
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
|
||||
pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
|
||||
pkg_check_modules(DepBLAS flexiblas_api)
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
|
||||
add_compile_definitions(GGML_BLAS_USE_MKL)
|
||||
# all Intel* libraries share the same include path
|
||||
pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
|
||||
pkg_check_modules(DepBLAS mkl-sdl)
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
|
||||
# this doesn't provide pkg-config
|
||||
# suggest to assign BLAS_INCLUDE_DIRS on your own
|
||||
|
@ -511,8 +513,8 @@ if (GGML_HIPBLAS)
|
|||
endif()
|
||||
|
||||
if (GGML_SYCL)
|
||||
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
|
||||
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
|
||||
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
|
||||
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
|
||||
endif()
|
||||
|
||||
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
|
||||
|
@ -532,6 +534,9 @@ if (GGML_SYCL)
|
|||
list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
|
||||
|
||||
if (GGML_SYCL_F16)
|
||||
if (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
|
||||
endif()
|
||||
add_compile_definitions(GGML_SYCL_F16)
|
||||
endif()
|
||||
|
||||
|
@ -543,6 +548,12 @@ if (GGML_SYCL)
|
|||
|
||||
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
# INFO: Allowed Sub_group_sizes are not consistent through all
|
||||
# hip targets. For example, 64 is used for certain models, but the backend
|
||||
# does not support it.
|
||||
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
|
||||
else()
|
||||
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
|
||||
endif()
|
||||
|
@ -576,6 +587,12 @@ if (GGML_SYCL)
|
|||
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
|
||||
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
|
||||
if (GGML_SYCL_HIP_TARGET STREQUAL "")
|
||||
message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.")
|
||||
endif()
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}")
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
@ -1310,7 +1327,7 @@ add_library(ggml
|
|||
../include/ggml-backend.h
|
||||
ggml.c
|
||||
ggml-alloc.c
|
||||
ggml-backend.c
|
||||
ggml-backend.cpp
|
||||
ggml-quants.c
|
||||
ggml-quants.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
|
@ -1346,6 +1363,10 @@ if (MATH_LIBRARY)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
list(APPEND GGML_EXTRA_LIBS_PRIVATE dl) # Must be linked explicitly
|
||||
endif()
|
||||
|
||||
list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
|
||||
list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
|
||||
target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
|
||||
//#define GGML_ALLOCATOR_DEBUG
|
||||
|
||||
//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
|
||||
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
|
||||
#define AT_PRINTF(...)
|
||||
|
||||
|
||||
|
@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
|
|||
size = GGML_PAD(size, talloc->alignment);
|
||||
|
||||
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
|
||||
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
|
||||
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
|
||||
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
|
||||
GGML_ABORT("not enough space in the buffer");
|
||||
}
|
||||
|
@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
|
|||
best_fit_block = alloc->n_free_blocks - 1;
|
||||
} else {
|
||||
// this should never happen
|
||||
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
||||
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
|
||||
__func__, size, max_avail);
|
||||
GGML_ABORT("not enough space in the buffer");
|
||||
}
|
||||
|
@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
|
|||
}
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
||||
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
||||
for (int i = 0; i < 1024; i++) {
|
||||
if (alloc->allocated_tensors[i].tensor) {
|
||||
fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
||||
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
|
||||
alloc->allocated_tensors[i].offset,
|
||||
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
|
||||
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
GGML_LOG_DEBUG("\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -768,13 +768,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
|
|||
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
|
||||
if (new_size > cur_size || galloc->buffers[i] == NULL) {
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
#endif
|
||||
|
||||
ggml_backend_buffer_free(galloc->buffers[i]);
|
||||
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
|
||||
if (galloc->buffers[i] == NULL) {
|
||||
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||
|
@ -825,14 +825,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t
|
|||
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
|
||||
if (galloc->n_nodes != graph->n_nodes) {
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
|
||||
GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
if (galloc->n_leafs != graph->n_leafs) {
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
|
||||
GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
@ -843,7 +843,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|||
|
||||
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
|
||||
GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
@ -855,7 +855,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
|
|||
}
|
||||
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
||||
GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
@ -869,14 +869,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
|
|||
if (ggml_gallocr_needs_realloc(galloc, graph)) {
|
||||
if (galloc->n_buffers == 1) {
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
|
||||
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
|
||||
#endif
|
||||
if (!ggml_gallocr_reserve(galloc, graph)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
|
||||
GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
|
||||
#endif
|
||||
return false;
|
||||
}
|
||||
|
@ -940,7 +940,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
|
|||
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
|
||||
if (buffer == NULL) {
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
||||
GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
|
||||
#endif
|
||||
for (size_t i = 0; i < *n_buffers; i++) {
|
||||
ggml_backend_buffer_free((*buffers)[i]);
|
||||
|
@ -990,7 +990,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|||
}
|
||||
|
||||
if (this_size > max_size) {
|
||||
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
|
||||
GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
|
||||
__func__, t->name,
|
||||
ggml_backend_buft_name(buft),
|
||||
this_size, max_size);
|
||||
|
@ -1022,7 +1022,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
|
|||
|
||||
if (n_buffers == 0) {
|
||||
#ifndef NDEBUG
|
||||
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
|
||||
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -9,145 +9,218 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
//
|
||||
// Backend buffer
|
||||
// Backend buffer type
|
||||
//
|
||||
|
||||
// buffer type
|
||||
typedef void * ggml_backend_buffer_type_context_t;
|
||||
|
||||
struct ggml_backend_buffer_type_i {
|
||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
|
||||
const char * (*get_name) (ggml_backend_buffer_type_t buft);
|
||||
// allocate a buffer of this type
|
||||
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||
// tensor alignment
|
||||
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
|
||||
// max buffer size that can be allocated
|
||||
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
|
||||
// data size needed to allocate the tensor, including padding
|
||||
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||
// check if tensor data is in host memory
|
||||
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
|
||||
size_t (*get_alignment) (ggml_backend_buffer_type_t buft);
|
||||
// (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
|
||||
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
|
||||
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
|
||||
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
|
||||
// (optional) check if tensor data is in host memory (defaults to false)
|
||||
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer_type {
|
||||
struct ggml_backend_buffer_type_i iface;
|
||||
ggml_backend_buffer_type_context_t context;
|
||||
ggml_backend_dev_t device;
|
||||
void * context;
|
||||
};
|
||||
|
||||
// buffer
|
||||
typedef void * ggml_backend_buffer_context_t;
|
||||
//
|
||||
// Backend buffer
|
||||
//
|
||||
|
||||
struct ggml_backend_buffer_i {
|
||||
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
|
||||
void (*GGML_CALL free_buffer) (ggml_backend_buffer_t buffer);
|
||||
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
|
||||
void (*GGML_CALL init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
void (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
||||
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
|
||||
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||
const char * (*get_name) (ggml_backend_buffer_t buffer);
|
||||
// (optional) free the buffer
|
||||
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||
// base address of the buffer
|
||||
void * (*get_base) (ggml_backend_buffer_t buffer);
|
||||
// (optional) initialize a tensor in the buffer (eg. add tensor extras)
|
||||
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
// tensor data access
|
||||
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
||||
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
// (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
|
||||
bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
// clear the entire buffer
|
||||
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
// (optional) reset any internal state due to tensor initialization, such as tensor extras
|
||||
void (*reset) (ggml_backend_buffer_t buffer);
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer {
|
||||
struct ggml_backend_buffer_i iface;
|
||||
ggml_backend_buffer_type_t buft;
|
||||
ggml_backend_buffer_context_t context;
|
||||
void * context;
|
||||
size_t size;
|
||||
enum ggml_backend_buffer_usage usage;
|
||||
};
|
||||
|
||||
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
ggml_backend_buffer_type_t buft,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
ggml_backend_buffer_context_t context,
|
||||
size_t size);
|
||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
ggml_backend_buffer_type_t buft,
|
||||
struct ggml_backend_buffer_i iface,
|
||||
void * context,
|
||||
size_t size);
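
The structs and the init function above are the whole contract a buffer has to satisfy, so a minimal sketch may help make the "(optional)" annotations concrete. The following is a hypothetical illustration only, not part of this commit: the example_* names are invented, the buffer keeps its raw allocation in context, only the required callbacks are filled in, and the callbacks marked optional above are left NULL so the documented defaults apply.

// hypothetical sketch: a trivial host buffer implementing the new ggml_backend_buffer_i
// (example_* names are invented for illustration; optional callbacks stay NULL)
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static const char * example_buffer_get_name(ggml_backend_buffer_t buffer) {
    GGML_UNUSED(buffer);
    return "EXAMPLE";
}

static void example_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context); // the context holds the raw allocation in this sketch
}

static void * example_buffer_get_base(ggml_backend_buffer_t buffer) {
    return buffer->context;
}

static void example_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                         uint8_t value, size_t offset, size_t size) {
    memset((char *) tensor->data + offset, value, size);
    GGML_UNUSED(buffer);
}

static void example_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                      const void * data, size_t offset, size_t size) {
    memcpy((char *) tensor->data + offset, data, size);
    GGML_UNUSED(buffer);
}

static void example_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,
                                      void * data, size_t offset, size_t size) {
    memcpy(data, (const char *) tensor->data + offset, size);
    GGML_UNUSED(buffer);
}

static void example_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}

static const struct ggml_backend_buffer_i example_buffer_i = {
    /* .get_name      = */ example_buffer_get_name,
    /* .free_buffer   = */ example_buffer_free_buffer,
    /* .get_base      = */ example_buffer_get_base,
    /* .init_tensor   = */ NULL,
    /* .memset_tensor = */ example_buffer_memset_tensor,
    /* .set_tensor    = */ example_buffer_set_tensor,
    /* .get_tensor    = */ example_buffer_get_tensor,
    /* .cpy_tensor    = */ NULL,
    /* .clear         = */ example_buffer_clear,
    /* .reset         = */ NULL,
};

// a matching allocator would call ggml_backend_buffer_init(buft, example_buffer_i, malloc(size), size)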
|
||||
|
||||
// do not use directly, use ggml_backend_tensor_copy instead
|
||||
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// multi-buffer
|
||||
// buffer that contains a collection of buffers
|
||||
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
||||
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
||||
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
|
||||
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
|
||||
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
|
||||
//
|
||||
// Backend
|
||||
// Backend (stream)
|
||||
//
|
||||
|
||||
typedef void * ggml_backend_context_t;
|
||||
|
||||
struct ggml_backend_i {
|
||||
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
|
||||
const char * (*get_name)(ggml_backend_t backend);
|
||||
|
||||
void (*GGML_CALL free)(ggml_backend_t backend);
|
||||
void (*free)(ggml_backend_t backend);
|
||||
|
||||
// Will be moved to the device interface
|
||||
// buffer allocation
|
||||
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
|
||||
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
|
||||
|
||||
// (optional) asynchronous tensor data access
|
||||
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
|
||||
// (optional) complete all pending operations
|
||||
void (*GGML_CALL synchronize)(ggml_backend_t backend);
|
||||
void (*synchronize)(ggml_backend_t backend);
|
||||
|
||||
// compute graph with a plan (not used currently)
|
||||
// create a new plan for a graph
|
||||
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
// (optional) compute graph with a plan (not used currently)
|
||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
|
||||
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
|
||||
void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
|
||||
void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
|
||||
// compute the graph with the plan
|
||||
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||
|
||||
// compute graph without a plan (async)
|
||||
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
// compute graph (always async if supported by the backend)
|
||||
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||
|
||||
// check if the backend can compute an operation
|
||||
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// check if the backend can use tensors allocated in a buffer type
|
||||
bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
||||
|
||||
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
|
||||
// these should be expensive operations with large batch sizes that may benefit from running on this backend
|
||||
// even if the weight has to be copied from the CPU temporarily
|
||||
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
|
||||
// new backends should implement the device interface instead
|
||||
// These functions are being moved to the device interface
|
||||
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
|
||||
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
|
||||
|
||||
// (optional) event synchronization
|
||||
// create a new event that can record events on this backend instance
|
||||
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
|
||||
void (*GGML_CALL event_free) (ggml_backend_event_t event);
|
||||
// record an event on the backend instance that created it
|
||||
void (*GGML_CALL event_record) (ggml_backend_event_t event);
|
||||
// wait for an event on a different backend instance
|
||||
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
||||
// block until an event is recorded
|
||||
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
|
||||
// record an event on this stream
|
||||
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
|
||||
// wait for an event on a different stream
|
||||
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
|
||||
};
|
||||
|
||||
struct ggml_backend {
|
||||
ggml_guid_t guid;
|
||||
|
||||
struct ggml_backend_i iface;
|
||||
ggml_backend_context_t context;
|
||||
ggml_backend_dev_t device;
|
||||
void * context;
|
||||
};
|
||||
|
||||
struct ggml_backend_event {
|
||||
ggml_backend_t backend;
|
||||
struct ggml_backend_device * device;
|
||||
void * context;
|
||||
};
|
||||
|
||||
//
|
||||
// Backend registry
|
||||
// Backend device
|
||||
//
|
||||
|
||||
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
|
||||
// Note: if additional properties are needed, we should add a struct with all of them
|
||||
// the current functions to obtain the properties can remain, since they are more convenient for often-used properties
|
||||
struct ggml_backend_device_i {
|
||||
// device name: short identifier for this device, such as "CPU" or "CUDA0"
|
||||
const char * (*get_name)(ggml_backend_dev_t dev);
|
||||
|
||||
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
|
||||
// device description: short informative description of the device, could be the model name
|
||||
const char * (*get_description)(ggml_backend_dev_t dev);
|
||||
|
||||
// device memory in bytes
|
||||
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
|
||||
|
||||
// device type
|
||||
enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
|
||||
|
||||
// device properties
|
||||
void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
|
||||
|
||||
// backend (stream) initialization
|
||||
ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
|
||||
|
||||
// preferred buffer type
|
||||
ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
|
||||
|
||||
// (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
|
||||
ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
|
||||
|
||||
// (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
|
||||
ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
|
||||
|
||||
// check if the backend can compute an operation
|
||||
bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
|
||||
|
||||
// check if the backend can use tensors allocated in a buffer type
|
||||
bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
|
||||
|
||||
// (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
|
||||
// these should be expensive operations that may benefit from running on this backend instead of the CPU backend
|
||||
bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
|
||||
|
||||
// (optional) event synchronization
|
||||
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
|
||||
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
|
||||
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
|
||||
};
|
||||
|
||||
struct ggml_backend_device {
|
||||
struct ggml_backend_device_i iface;
|
||||
ggml_backend_reg_t reg;
|
||||
void * context;
|
||||
};
|
||||
|
||||
//
|
||||
// Backend (reg)
|
||||
//
|
||||
|
||||
struct ggml_backend_reg_i {
|
||||
const char * (*get_name)(ggml_backend_reg_t reg);
|
||||
|
||||
// enumerate available devices
|
||||
size_t (*get_device_count)(ggml_backend_reg_t reg);
|
||||
ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
|
||||
|
||||
// (optional) get a pointer to a function in the backend
|
||||
// backends can add custom functions that are not part of the standard ggml-backend interface
|
||||
void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
|
||||
};
|
||||
|
||||
struct ggml_backend_reg {
|
||||
// int api_version; // TODO: for dynamic loading
|
||||
struct ggml_backend_reg_i iface;
|
||||
void * context;
|
||||
};
|
||||
|
||||
|
||||
// Internal backend registry API
|
||||
void ggml_backend_register(ggml_backend_reg_t reg);
|
||||
void ggml_backend_device_register(ggml_backend_dev_t device);
|
||||
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
|
||||
// typedef ggml_backend_register_t * (*ggml_backend_init)(void);
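
To make the registry interface above a bit more concrete, here is a small, hypothetical caller (not part of this commit) that uses the optional get_proc_address hook to reach a backend-specific entry point. The name "ggml_backend_set_n_threads" and its signature mirror what the BLAS backend registers later in this same commit; the example_* names are invented for illustration.

// hypothetical illustration of the optional get_proc_address hook
typedef void (*example_set_n_threads_t)(ggml_backend_t backend, int n_threads);

static void example_try_set_n_threads(ggml_backend_reg_t reg, ggml_backend_t backend, int n_threads) {
    if (reg->iface.get_proc_address == NULL) {
        return; // the hook itself is optional
    }
    example_set_n_threads_t fn =
        (example_set_n_threads_t) reg->iface.get_proc_address(reg, "ggml_backend_set_n_threads");
    if (fn != NULL) {
        fn(backend, n_threads); // e.g. the BLAS backend maps this name to ggml_backend_blas_set_n_threads
    }
}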
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
|
@ -4,6 +4,7 @@

#include <future>
#include <vector>
#include <cstring>

#if defined(GGML_USE_ACCELERATE)
#   include <Accelerate/Accelerate.h>

@ -26,30 +27,6 @@ struct ggml_backend_blas_context {
#endif
};

// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];

const int64_t ne10 = src1->ne[0];

const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];

// TODO: find the optimal values for these
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
src1->type == GGML_TYPE_F32 &&
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {

/*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
return true;
}

return false;
}

static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];

@ -88,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
// convert src0 to float
if (type != GGML_TYPE_F32) {
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
ggml_to_float_t const to_float = type_traits.to_float;
const auto * type_traits = ggml_get_type_traits(type);
ggml_to_float_t const to_float = type_traits->to_float;

for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@ -235,25 +212,25 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g
|
|||
|
||||
// backend interface
|
||||
|
||||
GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
|
||||
static const char * ggml_backend_blas_get_name(ggml_backend_t backend) {
|
||||
return "BLAS";
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
|
||||
static void ggml_backend_blas_free(ggml_backend_t backend) {
|
||||
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
|
||||
delete ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
|
||||
static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
|
||||
return ggml_backend_cpu_buffer_type();
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
|
@ -285,29 +262,8 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
|
|||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * src1 = op->src[1];
|
||||
|
||||
return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) ||
|
||||
(op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 &&
|
||||
op->src[1]->type == GGML_TYPE_F32 &&
|
||||
ggml_is_matrix(src0) &&
|
||||
ggml_is_matrix(src1) &&
|
||||
ggml_is_contiguous(src0) &&
|
||||
(ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||
return ggml_backend_buft_is_host(buft);
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static struct ggml_backend_i blas_backend_i = {
|
||||
/* .get_name = */ ggml_backend_blas_name,
|
||||
/* .get_name = */ ggml_backend_blas_get_name,
|
||||
/* .free = */ ggml_backend_blas_free,
|
||||
/* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
|
@ -319,14 +275,11 @@ static struct ggml_backend_i blas_backend_i = {
|
|||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_blas_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_blas_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_blas_supports_buft,
|
||||
/* .supports_op = */ NULL,
|
||||
/* .supports_buft = */ NULL,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_blas_guid(void) {
|
||||
|
@ -340,23 +293,24 @@ ggml_backend_t ggml_backend_blas_init(void) {
|
|||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_blas_guid(),
|
||||
/* .interface = */ blas_backend_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
|
||||
#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
|
||||
if (openblas_get_parallel() != OPENBLAS_OPENMP) {
|
||||
fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
|
||||
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
|
||||
fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
|
||||
#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
|
||||
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
|
||||
#endif
|
||||
|
||||
return backend;
|
||||
}
|
||||
|
||||
GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
|
||||
bool ggml_backend_is_blas(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
|
||||
}
|
||||
|
||||
|
@ -366,3 +320,205 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
|
|||
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
|
||||
ctx->n_threads = n_threads;
|
||||
}
|
||||
|
||||
// device interface
|
||||
|
||||
static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
|
||||
return "BLAS";
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
return "Accelerate";
|
||||
#elif defined(GGML_BLAS_USE_MKL)
|
||||
return "MKL";
|
||||
#elif defined(GGML_BLAS_USE_BLIS)
|
||||
return "BLIS";
|
||||
#elif defined(GGML_BLAS_USE_NVPL)
|
||||
return "NVPL";
|
||||
#elif defined(OPENBLAS_VERSION)
|
||||
return "OpenBLAS";
|
||||
#else
|
||||
return "BLAS";
|
||||
#endif
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
// TODO
|
||||
*free = 0;
|
||||
*total = 0;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
|
||||
return GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_blas_device_get_name(dev);
|
||||
props->description = ggml_backend_blas_device_get_description(dev);
|
||||
props->type = ggml_backend_blas_device_get_type(dev);
|
||||
ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
/* .host_buffer = */ false,
|
||||
/* .buffer_from_host_ptr = */ true,
|
||||
/* .events = */ false,
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
return ggml_backend_blas_init();
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(params);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
return ggml_backend_cpu_buffer_type();
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(max_tensor_size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * src1 = op->src[1];
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_VIEW:
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
return true;
|
||||
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
// BLAS is usually only faster for large matrices
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * src1 = op->src[1];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
|
||||
const int64_t ne0 = op->ne[0];
|
||||
const int64_t ne1 = op->ne[1];
|
||||
|
||||
// TODO: find the optimal value
|
||||
const int64_t min_batch = 32;
|
||||
|
||||
return ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) &&
|
||||
src1->type == GGML_TYPE_F32 &&
|
||||
(ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
|
||||
(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
|
||||
}
|
||||
|
||||
case GGML_OP_OUT_PROD:
|
||||
return op->src[0]->type == GGML_TYPE_F32 &&
|
||||
op->src[1]->type == GGML_TYPE_F32 &&
|
||||
ggml_is_matrix(src0) &&
|
||||
ggml_is_matrix(src1) &&
|
||||
ggml_is_contiguous(src0) &&
|
||||
(ggml_is_contiguous(src1) || ggml_is_transposed(src1)) &&
|
||||
(src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
|
||||
|
||||
default:
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
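
As a rough illustration of the batch-size heuristic above (my own example, not from the commit), the following standalone snippet restates the check with made-up shapes: a prompt-processing matmul with a batch of 512 columns clears the min_batch threshold, while a single-token decode (ne1 == 1) does not and stays on the regular CPU path.

// hypothetical restatement of the min_batch check, with invented shapes
#include <cstdint>
#include <cstdio>

static bool example_blas_worth_it(int64_t ne0, int64_t ne1, int64_t ne10) {
    const int64_t min_batch = 32; // mirrors the constant in the diff above
    return ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch;
}

int main() {
    std::printf("prompt batch of 512: %d\n", example_blas_worth_it(4096, 512, 4096)); // 1 -> BLAS path
    std::printf("single-token decode: %d\n", example_blas_worth_it(4096,   1, 4096)); // 0 -> CPU path
    return 0;
}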
|
||||
|
||||
static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
return ggml_backend_buft_is_host(buft);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
|
||||
/* .get_name = */ ggml_backend_blas_device_get_name,
|
||||
/* .get_description = */ ggml_backend_blas_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_blas_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_blas_device_get_type,
|
||||
/* .get_props = */ ggml_backend_blas_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_blas_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr,
|
||||
/* .supports_op = */ ggml_backend_blas_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_blas_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
// backend reg interface
|
||||
|
||||
static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) {
|
||||
return "BLAS";
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
return 1;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
||||
GGML_ASSERT(index == 0);
|
||||
|
||||
static ggml_backend_device ggml_backend_blas_device = {
|
||||
/* .iface = */ ggml_backend_blas_device_i,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
return &ggml_backend_blas_device;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
GGML_UNUSED(index);
|
||||
}
|
||||
|
||||
static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
|
||||
return (void *)ggml_backend_blas_set_n_threads;
|
||||
}
|
||||
return NULL;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
GGML_UNUSED(name);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
|
||||
/* .get_name = */ ggml_backend_blas_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_blas_reg_get_device_count,
|
||||
/* .get_device = */ ggml_backend_blas_reg_get_device,
|
||||
/* .get_proc_address = */ ggml_backend_blas_get_proc_address,
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_blas_reg(void) {
|
||||
static struct ggml_backend_reg ggml_backend_blas_reg = {
|
||||
/* .iface = */ ggml_backend_blas_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_blas_reg;
|
||||
}
|
||||
|
|
|
@ -39,69 +39,6 @@
|
|||
|
||||
#include "ggml-common.h"
|
||||
|
||||
/**
|
||||
* @brief Default logging callback for GGML.
|
||||
*
|
||||
* This function is the default logging callback that logs messages to stderr.
|
||||
*
|
||||
* @param level The log level.
|
||||
* @param msg The log message.
|
||||
* @param user_data User data passed to the callback.
|
||||
*/
|
||||
static void ggml_cann_default_log_callback(enum ggml_log_level level,
|
||||
const char* msg, void* user_data) {
|
||||
GGML_UNUSED(level);
|
||||
GGML_UNUSED(user_data);
|
||||
fprintf(stderr, "%s", msg);
|
||||
}
|
||||
|
||||
ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback;
|
||||
void* ggml_cann_log_user_data = NULL;
|
||||
|
||||
GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
|
||||
void* user_data) {
|
||||
ggml_cann_log_callback = log_callback;
|
||||
ggml_cann_log_user_data = user_data;
|
||||
}
|
||||
|
||||
#define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define GGML_CANN_LOG_ERROR(...) \
|
||||
ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
|
||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||
|
||||
/**
|
||||
* @brief Log a message using the current logging callback.
|
||||
*
|
||||
* This function formats a log message and passes it to the current logging
|
||||
* callback.
|
||||
*
|
||||
* @param level The log level.
|
||||
* @param format The format string for the log message.
|
||||
* @param ... The arguments for the format string.
|
||||
*/
|
||||
static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
|
||||
if (ggml_cann_log_callback != NULL) {
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
char buffer[128];
|
||||
int len = vsnprintf(buffer, 128, format, args);
|
||||
if (len < 128) {
|
||||
ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data);
|
||||
} else {
|
||||
// vsnprintf adds a null terminator
|
||||
std::vector<char> buffer2(len + 1);
|
||||
va_end(args);
|
||||
va_start(args, format);
|
||||
vsnprintf(&buffer2[0], buffer2.size(), format, args);
|
||||
ggml_cann_log_callback(level, buffer2.data(),
|
||||
ggml_cann_log_user_data);
|
||||
}
|
||||
va_end(args);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Handles CANN errors by printing an error message and aborting.
|
||||
*
|
||||
|
@ -116,10 +53,10 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
|
|||
int32_t id = -1;
|
||||
aclrtGetDevice(&id);
|
||||
|
||||
GGML_CANN_LOG_ERROR("CANN error: %s\n", msg);
|
||||
GGML_CANN_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
|
||||
GGML_LOG_ERROR("CANN error: %s\n", msg);
|
||||
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
|
||||
file, line);
|
||||
GGML_CANN_LOG_ERROR(" %s\n", stmt);
|
||||
GGML_LOG_ERROR(" %s\n", stmt);
|
||||
// abort with GGML_ASSERT to get a stack trace
|
||||
GGML_ABORT("CANN error");
|
||||
}
|
||||
|
@ -165,7 +102,7 @@ static ggml_cann_device_info ggml_cann_init() {
|
|||
aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);
|
||||
|
||||
if (err != ACL_SUCCESS) {
|
||||
GGML_CANN_LOG_ERROR("%s: failed to initialize CANN: %s\n",
|
||||
GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
|
||||
__func__, aclGetRecentErrMsg());
|
||||
return info;
|
||||
}
|
||||
|
@ -315,7 +252,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
|
|||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_CANN_LOG_INFO(
|
||||
GGML_LOG_INFO(
|
||||
"%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
|
||||
"requested %u MB\n",
|
||||
__func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
|
||||
|
@ -470,7 +407,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|||
// add to the pool
|
||||
pool_size += reserve_size;
|
||||
|
||||
// GGML_CANN_LOG_INFO("cann pool[%d]: size increased to %llu MB (
|
||||
// GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
|
||||
// reserved %llu MB)\n",
|
||||
// device, (unsigned long long) (pool_size/1024/1024),
|
||||
// (unsigned long long) (reserve_size/1024/1024));
|
||||
|
@ -483,7 +420,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|||
pool_used += size;
|
||||
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_CANN_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
|
||||
GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
|
||||
(unsigned long long)size, (unsigned long long)ptr);
|
||||
#endif
|
||||
return ptr;
|
||||
|
@ -497,7 +434,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
|
|||
*/
|
||||
void free(void* ptr, size_t size) override {
|
||||
#ifdef DEBUG_CANN_MALLOC
|
||||
GGML_CANN_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
|
||||
GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
|
||||
(unsigned long long)size, (unsigned long long)ptr);
|
||||
#endif
|
||||
|
||||
|
@ -560,7 +497,7 @@ struct ggml_backend_cann_buffer_context {
|
|||
* @return A pointer to a C-string containing the name of the buffer.
|
||||
*/
|
||||
|
||||
GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
|
||||
static const char* ggml_backend_cann_buffer_get_name(
|
||||
ggml_backend_buffer_t buffer) {
|
||||
return "CANN";
|
||||
|
||||
|
@ -576,7 +513,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
|
|||
* @param buffer The buffer to check.
|
||||
* @return true if the buffer is a CANN buffer, false otherwise.
|
||||
*/
|
||||
GGML_CALL static bool ggml_backend_buffer_is_cann(
|
||||
static bool ggml_backend_buffer_is_cann(
|
||||
ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
|
||||
}
|
||||
|
@ -589,7 +526,7 @@ GGML_CALL static bool ggml_backend_buffer_is_cann(
|
|||
*
|
||||
* @param buffer The CANN buffer to free.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
|
||||
static void ggml_backend_cann_buffer_free_buffer(
|
||||
ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cann_buffer_context* ctx =
|
||||
(ggml_backend_cann_buffer_context*)buffer->context;
|
||||
|
@ -605,7 +542,7 @@ GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
|
|||
* @param buffer The CANN buffer whose base pointer is to be retrieved.
|
||||
* @return A pointer to the base of the device memory allocated for the buffer.
|
||||
*/
|
||||
GGML_CALL static void* ggml_backend_cann_buffer_get_base(
|
||||
static void* ggml_backend_cann_buffer_get_base(
|
||||
ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_cann_buffer_context* ctx =
|
||||
(ggml_backend_cann_buffer_context*)buffer->context;
|
||||
|
@ -625,9 +562,9 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
|
|||
* @param dst Pointer to the destination buffer where transformed data will be
|
||||
* stored.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
||||
const void* src,
|
||||
void* dst) {
|
||||
static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
||||
const void* src,
|
||||
void* dst) {
|
||||
|
||||
int64_t n_elems = ggml_nelements(tensor);
|
||||
int64_t groups = n_elems / QK4_0;
|
||||
|
@ -677,7 +614,7 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
|
|||
* @param dst Pointer to the destination buffer where the Q4.0 formatted data
|
||||
* will be stored.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
|
||||
static void ggml_backend_cann_transform_back_q4_0(
|
||||
const ggml_tensor* tensor, void* src, void* dst) {
|
||||
|
||||
int64_t n_elems = ggml_nelements(tensor);
|
||||
|
@ -726,9 +663,9 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
|
|||
* @param dst Pointer to the destination buffer where transformed data will be
|
||||
* stored.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
|
||||
const void* src,
|
||||
void* dst) {
|
||||
static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
|
||||
const void* src,
|
||||
void* dst) {
|
||||
int64_t n_elems = ggml_nelements(tensor);
|
||||
int64_t groups = n_elems / QK8_0;
|
||||
size_t quant_bytes = n_elems * sizeof(uint8_t);
|
||||
|
@ -760,7 +697,7 @@ GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
|
|||
* @param dst Pointer to the destination buffer where the Q8.0 formatted data
|
||||
* will be stored.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
|
||||
static void ggml_backend_cann_transform_back_q8_0(
|
||||
const ggml_tensor* tensor, const void* src, void* dst) {
|
||||
int64_t n_elems = ggml_nelements(tensor);
|
||||
int64_t groups = n_elems / QK8_0;
|
||||
|
@ -792,8 +729,8 @@ GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
|
|||
* @param dst Pointer to the destination buffer where transformed data will be
|
||||
* stored.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
|
||||
const void* src, void* dst) {
|
||||
static void ggml_backend_cann_transform(ggml_tensor* tensor,
|
||||
const void* src, void* dst) {
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
ggml_backend_cann_transform_q4_0(tensor, src, dst);
|
||||
|
@ -818,7 +755,7 @@ GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
|
|||
* @param dst Pointer to the destination buffer where transformed tensor data
|
||||
* will be stored.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_transform_back(
|
||||
static void ggml_backend_cann_transform_back(
|
||||
const ggml_tensor* tensor, void* src, void* dst) {
|
||||
switch (tensor->type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
|
@ -841,7 +778,7 @@ GGML_CALL static void ggml_backend_cann_transform_back(
|
|||
* @param type The tensor type to check.
|
||||
* @return true if transformation is needed, false otherwise.
|
||||
*/
|
||||
GGML_CALL static bool need_transform(ggml_type type) {
|
||||
static bool need_transform(ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
|
@ -860,7 +797,7 @@ GGML_CALL static bool need_transform(ggml_type type) {
|
|||
* @param buffer The CANN buffer from which to initialize the tensor.
|
||||
* @param tensor Pointer to the tensor to be initialized.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
|
||||
static void ggml_backend_cann_buffer_init_tensor(
|
||||
ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
|
||||
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
||||
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
|
||||
|
@ -896,7 +833,7 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
|
|||
* @param offset Offset in the source data from where to start copying.
|
||||
* @param size Size of the data to be copied, in bytes.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
|
||||
static void ggml_backend_cann_buffer_set_tensor(
|
||||
ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
|
||||
size_t offset, size_t size) {
|
||||
ggml_backend_cann_buffer_context *ctx =
|
||||
|
@ -941,7 +878,7 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
|
|||
* @param offset Offset in the destination buffer where to start copying.
|
||||
* @param size Size of the data to be copied, in bytes.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
|
||||
static void ggml_backend_cann_buffer_get_tensor(
|
||||
ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
|
||||
size_t offset, size_t size) {
|
||||
ggml_backend_cann_buffer_context* ctx =
|
||||
|
@ -975,7 +912,7 @@ GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
|
|||
* @param dst Pointer to the destination tensor where the data will be copied.
|
||||
* @return true if the copy operation succeeded, false otherwise.
|
||||
*/
|
||||
GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
|
||||
static bool ggml_backend_cann_buffer_cpy_tensor(
|
||||
ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
|
||||
if (ggml_backend_buffer_is_cann(src->buffer)) {
|
||||
ggml_backend_cann_buffer_context* src_ctx =
|
||||
|
@ -1017,7 +954,7 @@ GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
|
|||
* @param buffer The CANN buffer to be cleared.
|
||||
* @param value The value to which each byte in the buffer will be set.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_buffer_clear(
|
||||
static void ggml_backend_cann_buffer_clear(
|
||||
ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
ggml_backend_cann_buffer_context* ctx =
|
||||
(ggml_backend_cann_buffer_context*)buffer->context;
|
||||
|
@ -1065,7 +1002,7 @@ struct ggml_backend_cann_buffer_type_context {
|
|||
* @param buft Pointer to the buffer type context.
|
||||
* @return Const pointer to the C-style string containing the name.
|
||||
*/
|
||||
GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
|
||||
static const char* ggml_backend_cann_buffer_type_name(
|
||||
ggml_backend_buffer_type_t buft) {
|
||||
return "CANN";
|
||||
|
||||
|
@ -1082,7 +1019,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
|
|||
* @param size Size in bytes of the buffer to allocate.
|
||||
* @return Pointer to the allocated buffer, or nullptr if allocation fails.
|
||||
*/
|
||||
GGML_CALL static ggml_backend_buffer_t
|
||||
static ggml_backend_buffer_t
|
||||
ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
||||
size_t size) {
|
||||
ggml_backend_cann_buffer_type_context* buft_ctx =
|
||||
|
@ -1095,7 +1032,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|||
void* dev_ptr;
|
||||
aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
|
||||
if (err != ACL_SUCCESS) {
|
||||
GGML_CANN_LOG_ERROR(
|
||||
GGML_LOG_ERROR(
|
||||
"%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
|
||||
__func__, size / 1024.0 / 1024.0, buft_ctx->device,
|
||||
aclGetRecentErrMsg());
|
||||
|
@ -1121,7 +1058,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
|||
* @return The alignment requirement in bytes (fixed at 128 bytes for CANN
|
||||
* buffers).
|
||||
*/
|
||||
GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
|
||||
static size_t ggml_backend_cann_buffer_type_get_alignment(
|
||||
ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
|
||||
|
@ -1142,7 +1079,7 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
|
|||
* @return The total allocation size in bytes required for the tensor in the
|
||||
* CANN buffer.
|
||||
*/
|
||||
GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size(
|
||||
static size_t ggml_backend_cann_buffer_type_get_alloc_size(
|
||||
ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
|
||||
size_t size = ggml_nbytes(tensor);
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
|
@ -1193,7 +1130,7 @@ static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
|
|||
* @return A pointer to the buffer type interface for the specified device, or
|
||||
* nullptr if the device index is out of range.
|
||||
*/
|
||||
GGML_CALL ggml_backend_buffer_type_t
|
||||
ggml_backend_buffer_type_t
|
||||
ggml_backend_cann_buffer_type(int32_t device) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
@ -1231,7 +1168,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
|
|||
* @param buft Pointer to the host buffer type context.
|
||||
* @return Const pointer to the C-style string containing the name.
|
||||
*/
|
||||
GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CANN_Host";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
|
@ -1246,7 +1183,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backe
|
|||
* @param buft Pointer to the host buffer context.
|
||||
* @return Const pointer to the C-style string containing the name.
|
||||
*/
|
||||
GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return "CANN_Host";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
|
@ -1260,7 +1197,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_bu
|
|||
*
|
||||
* @param buffer The CANN host buffer to free.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
ACL_CHECK(aclrtFreeHost(buffer->context));
|
||||
}
|
||||
|
||||
|
@ -1280,7 +1217,7 @@ static void * ggml_cann_host_malloc(size_t size) {
|
|||
aclError err = aclrtMallocHost((void **) &hostPtr, size);
|
||||
if (err != ACL_SUCCESS) {
|
||||
|
||||
GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
|
||||
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
|
||||
return nullptr;
|
||||
}
|
||||
|
@ -1294,7 +1231,7 @@ static void * ggml_cann_host_malloc(size_t size) {
|
|||
* @param size Size in bytes of the host buffer to allocate.
|
||||
* @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
|
||||
*/
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * hostPtr = ggml_cann_host_malloc(size);
|
||||
|
||||
if (hostPtr == nullptr) {
|
||||
|
@ -1316,7 +1253,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_
|
|||
* Provides function pointers for allocating, querying properties, and managing
|
||||
* memory for CANN buffer types in the GGML backend.
|
||||
*/
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
||||
ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cann_host_buffer_type_name,
|
||||
|
@ -1326,6 +1263,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
|
|||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
|
@ -1495,7 +1433,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
|||
* @param backend Pointer to the CANN backend structure.
|
||||
* @return A pointer to a constant string representing the backend name.
|
||||
*/
|
||||
GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
|
||||
static const char* ggml_backend_cann_name(ggml_backend_t backend) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
|
||||
|
@ -1510,7 +1448,7 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
|
|||
*
|
||||
* @param backend Pointer to the CANN backend structure to be freed.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
|
||||
static void ggml_backend_cann_free(ggml_backend_t backend) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
ACL_CHECK(aclrtSynchronizeDevice());
|
||||
|
@ -1535,7 +1473,7 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
|
|||
* @param backend Pointer to the CANN backend structure.
|
||||
* @return Pointer to the buffer type structure for the CANN backend.
|
||||
*/
|
||||
GGML_CALL static ggml_backend_buffer_type_t
|
||||
static ggml_backend_buffer_type_t
|
||||
ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
|
@ -1556,11 +1494,11 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
|
|||
* @param offset Offset in bytes within the host data.
|
||||
* @param size Size of the data to copy in bytes.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_tensor *tensor,
|
||||
const void *data,
|
||||
size_t offset,
|
||||
size_t size) {
|
||||
static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_tensor *tensor,
|
||||
const void *data,
|
||||
size_t offset,
|
||||
size_t size) {
|
||||
ggml_backend_cann_context *cann_ctx =
|
||||
(ggml_backend_cann_context *)backend->context;
|
||||
|
||||
|
@ -1587,7 +1525,7 @@ GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
|||
}
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_cann_get_tensor_async(
|
||||
static void ggml_backend_cann_get_tensor_async(
|
||||
ggml_backend_t backend, const ggml_tensor *tensor, void *data,
|
||||
size_t offset, size_t size) {
|
||||
ggml_backend_cann_context *cann_ctx =
|
||||
|
@ -1626,7 +1564,7 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
|
|||
* @param dst Pointer to the destination tensor to copy data to.
|
||||
* @return true if the copy operation succeeds, false otherwise.
|
||||
*/
|
||||
GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
|
||||
static bool ggml_backend_cann_cpy_tensor_async(
|
||||
ggml_backend_t backend_src, ggml_backend_t backend_dst,
|
||||
const ggml_tensor* src, ggml_tensor* dst) {
|
||||
GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
|
||||
|
@ -1694,7 +1632,7 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
|
|||
*
|
||||
* @param backend Pointer to the CANN backend structure to synchronize.
|
||||
*/
|
||||
GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
||||
static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
|
||||
|
@ -1715,7 +1653,7 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
|||
* @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
|
||||
* completes successfully, otherwise an appropriate error status.
|
||||
*/
|
||||
GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
|
||||
static enum ggml_status ggml_backend_cann_graph_compute(
|
||||
ggml_backend_t backend, ggml_cgraph* cgraph) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
|
@ -1732,7 +1670,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
|
|||
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
||||
|
||||
if (!ok) {
|
||||
GGML_CANN_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
|
||||
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
|
||||
node->name, ggml_op_name(node->op));
|
||||
}
|
||||
GGML_ASSERT(ok);
|
||||
|
@ -1753,7 +1691,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
|
|||
* @return bool Returns true if the operation is supported by the backend,
|
||||
* otherwise false.
|
||||
*/
|
||||
GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
|
||||
static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
|
||||
const ggml_tensor* op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
|
@ -1875,7 +1813,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
|
|||
* @return bool Returns true if the CANN backend supports the buffer type,
|
||||
* otherwise false.
|
||||
*/
|
||||
GGML_CALL static bool ggml_backend_cann_supports_buft(
|
||||
static bool ggml_backend_cann_supports_buft(
|
||||
ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||
if (ggml_backend_buft_is_cann(buft)) {
|
||||
ggml_backend_cann_context * cann_ctx =
|
||||
|
@ -1901,7 +1839,7 @@ GGML_CALL static bool ggml_backend_cann_supports_buft(
|
|||
* @return bool Returns true if the operation should be offloaded, otherwise
|
||||
* false.
|
||||
*/
|
||||
GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
|
||||
static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
|
||||
const ggml_tensor* op) {
|
||||
const int min_batch_size = 32;
|
||||
GGML_UNUSED(backend);
|
||||
|
@ -2021,11 +1959,8 @@ static ggml_backend_i ggml_backend_cann_interface = {
|
|||
/* .supports_op = */ ggml_backend_cann_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_cann_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_cann_offload_op,
|
||||
/* .event_new = */ ggml_backend_cann_event_new,
|
||||
/* .event_free = */ ggml_backend_cann_event_free,
|
||||
/* .event_record = */ ggml_backend_cann_event_record,
|
||||
/* .event_wait = */ ggml_backend_cann_event_wait,
|
||||
/* .event_synchronize = */ ggml_backend_cann_event_synchronize,
|
||||
};
|
||||
|
||||
/**
|
||||
|
@@ -2042,91 +1977,46 @@ static ggml_guid_t ggml_backend_cann_guid() {
     return &guid;
 }
 
-GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
+ggml_backend_t ggml_backend_cann_init(int32_t device) {
     aclInit(nullptr);
     if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
-        GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
+        GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
     if (ctx == nullptr) {
-        GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
+        GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return nullptr;
     }
     ggml_cann_set_device(ctx->device);
     ggml_backend_t cann_backend =
         new ggml_backend{/* .guid      = */ ggml_backend_cann_guid(),
                          /* .interface = */ ggml_backend_cann_interface,
+                         /* .device    = */ nullptr,
                          /* .context   = */ ctx};
 
     return cann_backend;
 }
 
-GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) {
+bool ggml_backend_is_cann(ggml_backend_t backend) {
     return backend != NULL &&
            ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
 }
 
-GGML_CALL int32_t ggml_backend_cann_get_device_count() {
+int32_t ggml_backend_cann_get_device_count() {
     return ggml_cann_info().device_count;
 }
 
-GGML_CALL void ggml_backend_cann_get_device_description(
+void ggml_backend_cann_get_device_description(
     int32_t device, char* description, size_t description_size) {
     ggml_cann_set_device(device);
     const char* soc_name = aclrtGetSocName();
     snprintf(description, description_size, "%s", soc_name);
 }
 
-GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
-                                                   size_t* total) {
+void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
+                                         size_t* total) {
     ggml_cann_set_device(device);
     ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
 }
-
-// backend registry
-/**
- * @brief Initializes a CANN backend based on the provided parameters.
- *
- * This function initializes a CANN backend using the device index and then
- * initializes the backend using `ggml_backend_cann_init`.
- *
- * @param params Parameters for initialization (unused in this implementation).
- * @param user_data User data containing the device index to initialize the
- * backend.
- * @return ggml_backend_t The initialized CANN backend.
- */
-GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params,
-                                                           void* user_data) {
-    ggml_backend_t cann_backend =
-        ggml_backend_cann_init((int)(intptr_t)user_data);
-    return cann_backend;
-
-    GGML_UNUSED(params);
-}
-
-extern "C" GGML_CALL int ggml_backend_cann_reg_devices();
-
-/**
- * @brief Registers CANN (Ascend) devices as backend options.
- *
- * This function initializes ACL, retrieves the number of available CANN
- * devices, and registers each device as a backend option using
- * `ggml_backend_register`. Each device is given a unique name based on
- * `GGML_CANN_NAME` followed by its index.
- *
- * @return int The number of CANN devices registered.
- */
-GGML_CALL int ggml_backend_cann_reg_devices() {
-    uint32_t device_count = ggml_backend_cann_get_device_count();
-    // initialization
-    for (uint32_t i = 0; i < device_count; i++) {
-        char name[128];
-        snprintf(name, sizeof(name), "CANN%d", i);
-        ggml_backend_register(name, ggml_backend_reg_cann_init,
-                              ggml_backend_cann_buffer_type(i),
-                              (void*)(intptr_t)i);
-    }
-    return device_count;
-}
File diff suppressed because it is too large

ggml/src/ggml-cuda/argmax.cu (new file, 79 lines)
@@ -0,0 +1,79 @@
+#include "common.cuh"
+#include "argmax.cuh"
+#include "sum.cuh"
+
+#include <cstdint>
+
+static __global__ void argmax_f32(
+    const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) {
+
+    int argmax_thread = 0;
+    const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE;
+
+#pragma unroll
+    for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) {
+        const int64_t row = row0 + row1;
+
+        if (row >= nrows) {
+            break;
+        }
+
+        float maxval = -FLT_MAX;
+        int   argmax = -1;
+
+        for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) {
+            const float val        = x[row*ncols + col];
+            const int   bigger     = val > maxval;
+            const int   not_bigger = bigger ^ 0x00000001;
+
+            maxval = maxval*not_bigger + val*bigger;
+            argmax = argmax*not_bigger + col*bigger;
+        }
+
+#pragma unroll
+        for (int mask = 16; mask > 0; mask >>= 1) {
+            const float val        = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE);
+            const int   col        = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE);
+            const int   bigger     = val > maxval;
+            const int   not_bigger = bigger ^ 0x00000001;
+
+            maxval = maxval*not_bigger + val*bigger;
+            argmax = argmax*not_bigger + col*bigger;
+        }
+
+        const int store = row1 == threadIdx.x;
+        argmax_thread += store*argmax;
+    }
+
+    const int row = row0 + threadIdx.x;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    dst[row] = argmax_thread;
+}
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ne00  = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    const float * src0_d = (const float *) src0->data;
+    int32_t     * dst_d  = (int32_t     *)  dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE;
+
+    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const dim3 blocks_num(num_blocks, 1, 1);
+
+    argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00, nrows);
+}
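The kernel above assigns one warp per block, walks WARP_SIZE rows per block, replaces per-element branches with the `bigger`/`not_bigger` arithmetic select, and merges the 32 lane-local results with a shuffle butterfly. As a hedged aside (not part of the patch), a plain C++ reference like the following can serve as a test oracle for `ggml_cuda_argmax`; note that ties may resolve differently than on the GPU, where lane ordering decides.

```cpp
// Reference argmax over the rows of a row-major nrows x ncols matrix.
// Host-side sketch for testing only.
#include <cstdint>
#include <vector>

std::vector<int32_t> argmax_rows_ref(const std::vector<float> & x, int64_t ncols, int64_t nrows) {
    std::vector<int32_t> out(nrows, 0);
    for (int64_t r = 0; r < nrows; ++r) {
        float   best = x[r*ncols];
        int32_t arg  = 0;
        for (int64_t c = 1; c < ncols; ++c) {
            const float v = x[r*ncols + c];
            if (v > best) {   // strict '>' keeps the first maximum, like the kernel's per-lane scan
                best = v;
                arg  = (int32_t) c;
            }
        }
        out[r] = arg;
    }
    return out;
}
```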
ggml/src/ggml-cuda/argmax.cuh (new file, 3 lines)

@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -175,6 +175,18 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__
 
+static __device__ __forceinline__ int warp_reduce_sum(int x) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+    return __reduce_add_sync(0xffffffff, x);
+#else
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
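The new integer overload uses `__reduce_add_sync` on Ampere-class hardware and otherwise falls back to the same XOR-butterfly shuffle the float overload uses. The host-side model below (a sketch, not code from the patch) shows why every lane ends up holding the full sum after log2(32) = 5 exchange rounds.

```cpp
// Host-side model of the XOR-butterfly fallback in warp_reduce_sum(int).
// On the GPU the exchange uses __shfl_xor_sync; here it is array indexing.
#include <array>
#include <cstdio>

int main() {
    std::array<int, 32> lane{};
    for (int i = 0; i < 32; ++i) lane[i] = i;          // arbitrary per-lane values

    for (int mask = 16; mask > 0; mask >>= 1) {
        std::array<int, 32> next = lane;
        for (int i = 0; i < 32; ++i) {
            next[i] = lane[i] + lane[i ^ mask];        // partner lane differs in one bit
        }
        lane = next;
    }

    std::printf("lane 0 sum = %d (expected %d)\n", lane[0], 31*32/2);
    return 0;
}
```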
ggml/src/ggml-cuda/count-equal.cu (new file, 64 lines)
@@ -0,0 +1,64 @@
+#include "common.cuh"
+#include "count-equal.cuh"
+
+#include <cstdint>
+
+template <typename T>
+static __global__ void count_equal(const T * __restrict__ x, const T * __restrict__ y, int64_t * __restrict__ dst, const int64_t dk, const int64_t k) {
+    const int64_t i0 = (int64_t) blockIdx.x*dk;
+    const int64_t i1 = min(i0 + dk, k);
+
+    int nequal = 0;
+
+    for (int64_t i = i0 + threadIdx.x; i < i1; i += WARP_SIZE) {
+        const T xi = x[i];
+        const T yi = y[i];
+        nequal += xi == yi;
+    }
+
+    nequal = warp_reduce_sum(nequal);
+
+    if (threadIdx.x != 0) {
+        return;
+    }
+
+    atomicAdd((int *) dst, nequal);
+}
+
+void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == src1->type);
+    GGML_ASSERT( dst->type == GGML_TYPE_I64);
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    int64_t * dst_d = (int64_t *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
+
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
+    const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
+
+    CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));
+
+    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const dim3 blocks_num(std::min((int64_t)4*nsm, (ne + CUDA_COUNT_EQUAL_CHUNK_SIZE - 1)/CUDA_COUNT_EQUAL_CHUNK_SIZE), 1, 1);
+
+    switch (src0->type) {
+        case GGML_TYPE_I32: {
+            const int * src0_d = (const int *) src0->data;
+            const int * src1_d = (const int *) src1->data;
+            count_equal<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_d, dne, ne);
+        } break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+}
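Each block counts matches in its chunk of `dne` elements, the warp sum is reduced with the new integer `warp_reduce_sum`, and lane 0 accumulates into the int64 destination through a 32-bit `atomicAdd`, which is why the launcher asserts `ne < 2^30`. A hedged CPU reference (not part of the patch) for checking the result:

```cpp
// CPU reference for GGML_OP_COUNT_EQUAL on two int32 buffers of equal length.
#include <cstdint>

int64_t count_equal_ref(const int32_t * a, const int32_t * b, int64_t n) {
    int64_t nequal = 0;
    for (int64_t i = 0; i < n; ++i) {
        nequal += a[i] == b[i];
    }
    return nequal;
}
```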
ggml/src/ggml-cuda/count-equal.cuh (new file, 5 lines)
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128
+
+void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -259,7 +259,7 @@ static __global__ void flash_attn_tile_ext_f16(
         }
 
         half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
-        kqsum_j = warp_reduce_sum(kqsum_j);
+        kqsum_j = warp_reduce_sum((float)kqsum_j);
 
 #pragma unroll
         for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {

@@ -196,7 +196,7 @@ static __global__ void flash_attn_vec_ext_f16(
 #pragma unroll
         for (int j = 0; j < ncols; ++j) {
             half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
-            sum = warp_reduce_sum(sum);
+            sum = warp_reduce_sum((float)sum);
 
             if (use_logit_softcap) {
                 sum = logit_softcap*tanhf(sum);

@@ -265,7 +265,7 @@ static __global__ void flash_attn_vec_ext_f16(
 
 #pragma unroll
     for (int j = 0; j < ncols; ++j) {
-        kqsum[j] = warp_reduce_sum(kqsum[j]);
+        kqsum[j] = warp_reduce_sum((float)kqsum[j]);
         if (threadIdx.x == 0) {
             kqsum_shared[j][threadIdx.y] = kqsum[j];
         }

@@ -280,7 +280,7 @@ static __global__ void flash_attn_vec_ext_f16(
         }
 
         kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
-        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
+        kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]);
 
         half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
         if (parallel_blocks == 1) {
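These flash-attention call sites now cast their `half` arguments to `float` explicitly, which keeps the calls unambiguous once an `int` overload of `warp_reduce_sum` exists next to the `float` one. The standalone C++ model below reproduces the situation with a stand-in type; the names are illustrative, not CUDA's.

```cpp
// Why the (float) casts: a type with conversions to both float and int
// (as CUDA's __half has) no longer picks the float overload unambiguously
// once an int overload is in scope.
struct HalfLike {
    float v;
    operator float() const { return v; }          // like __half::operator float()
    operator int()   const { return (int) v; }    // __half also converts to integer types
};

static int   reduce(int x)   { return x; }
static float reduce(float x) { return x; }

int main() {
    HalfLike h{1.0f};
    // float r = reduce(h);          // error: ambiguous, both overloads are equally viable
    float r = reduce((float) h);     // the explicit cast, as in the patch, resolves it
    return (int) r;
}
```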
@ -33,6 +33,21 @@ extern "C" {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
|
||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||
void ggml_log_internal (enum ggml_log_level level, const char * format, ...);
|
||||
void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
#define GGML_LOG(...) ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
||||
#define GGML_LOG_INFO(...) ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||
#define GGML_LOG_WARN(...) ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||
#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define GGML_LOG_CONT(...) ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
||||
|
||||
// bitset
|
||||
|
||||
typedef uint32_t ggml_bitset_t;
|
||||
|
|
|
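The macros route every internal message through `ggml_log_internal`, which formats it and hands it to the registered log callback (stderr by default). The sketch below, not code from the patch, shows the printf-style usage the `GGML_ATTRIBUTE_FORMAT(2, 3)` annotation expects from internal callers; it assumes `ggml-impl.h` is included.

```cpp
// Illustrative use of the new logging macros inside ggml internals.
static bool example_load_step(size_t need, size_t have) {
    GGML_LOG_INFO("%s: need %zu bytes, %zu available\n", __func__, need, have);
    if (need > have) {
        GGML_LOG_ERROR("%s: buffer too small\n", __func__);
        return false;
    }
    if (need > have/2) {
        GGML_LOG_WARN("%s: less than 2x headroom left\n", __func__);
    }
    return true;
}
```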
@ -1921,6 +1921,7 @@ ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) {
|
|||
for (const auto & dev : devices) {
|
||||
vec.push_back({
|
||||
/* .iface = */ ggml_backend_kompute_buffer_type_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ new ggml_backend_kompute_buffer_type_context(dev.index, dev.bufferAlignment, dev.maxAlloc)
|
||||
});
|
||||
}
|
||||
|
@ -1989,11 +1990,8 @@ static struct ggml_backend_i kompute_backend_i = {
|
|||
/* .supports_op = */ ggml_backend_kompute_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_kompute_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_kompute_guid() {
|
||||
|
@ -2008,6 +2006,7 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
|
|||
ggml_backend_t kompute_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_kompute_guid(),
|
||||
/* .interface = */ kompute_backend_i,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ s_kompute_context,
|
||||
};
|
||||
|
||||
|
@ -2017,23 +2016,3 @@ ggml_backend_t ggml_backend_kompute_init(int device) {
|
|||
bool ggml_backend_is_kompute(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid());
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_reg_kompute_init(const char * params, void * user_data) {
|
||||
GGML_UNUSED(params);
|
||||
return ggml_backend_kompute_init(intptr_t(user_data));
|
||||
}
|
||||
|
||||
extern "C" int ggml_backend_kompute_reg_devices();
|
||||
|
||||
int ggml_backend_kompute_reg_devices() {
|
||||
auto devices = ggml_vk_available_devices_internal(0);
|
||||
for (const auto & device : devices) {
|
||||
ggml_backend_register(
|
||||
ggml_kompute_format_name(device.index).c_str(),
|
||||
ggml_backend_reg_kompute_init,
|
||||
ggml_backend_kompute_buffer_type(device.index),
|
||||
reinterpret_cast<void *>(intptr_t(device.index))
|
||||
);
|
||||
}
|
||||
return devices.size();
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
|
@ -25,7 +25,7 @@
|
|||
# include <netdb.h>
|
||||
# include <unistd.h>
|
||||
#endif
|
||||
#include <string.h>
|
||||
#include <cstring>
|
||||
|
||||
#define UNUSED GGML_UNUSED
|
||||
|
||||
|
@ -319,12 +319,12 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
|||
return sock;
|
||||
}
|
||||
|
||||
GGML_CALL static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
// input serialization format: | remote_ptr (8 bytes) |
|
||||
std::vector<uint8_t> input(sizeof(uint64_t), 0);
|
||||
|
@ -337,7 +337,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t
|
|||
delete ctx;
|
||||
}
|
||||
|
||||
GGML_CALL static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
if (ctx->base_cache.find(buffer) != ctx->base_cache.end()) {
|
||||
return ctx->base_cache[buffer];
|
||||
|
@ -388,7 +388,7 @@ static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
|
|||
return result;
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||
UNUSED(buffer);
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
// TODO: this check is due to MATRIX_ROW_PADDING in CUDA and should be generalized
|
||||
|
@ -396,7 +396,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_init_tensor(ggml_backend_buffer_t
|
|||
}
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
// input serialization format: | rpc_tensor | offset (8 bytes) | data (size bytes) |
|
||||
size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + size;
|
||||
|
@ -410,7 +410,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t b
|
|||
GGML_ASSERT(status);
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
// input serialization format: | rpc_tensor | offset (8 bytes) | size (8 bytes) |
|
||||
int input_size = sizeof(rpc_tensor) + 2*sizeof(uint64_t);
|
||||
|
@ -427,7 +427,7 @@ GGML_CALL static void ggml_backend_rpc_buffer_get_tensor(ggml_backend_buffer_t b
|
|||
memcpy(data, output.data(), size);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
|
||||
// check if src and dst are on the same server
|
||||
ggml_backend_buffer_t src_buffer = src->buffer;
|
||||
ggml_backend_rpc_buffer_context * src_ctx = (ggml_backend_rpc_buffer_context *)src_buffer->context;
|
||||
|
@ -452,7 +452,7 @@ GGML_CALL static bool ggml_backend_rpc_buffer_cpy_tensor(ggml_backend_buffer_t b
|
|||
return output[0];
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
static void ggml_backend_rpc_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
|
||||
// serialization format: | bufptr (8 bytes) | value (1 byte) |
|
||||
int input_size = sizeof(uint64_t) + sizeof(uint8_t);
|
||||
|
@ -477,12 +477,12 @@ static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
|
|||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
GGML_CALL static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
static const char * ggml_backend_rpc_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||
return buft_ctx->name.c_str();
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
static ggml_backend_buffer_t ggml_backend_rpc_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||
// input serialization format: | size (8 bytes) |
|
||||
int input_size = sizeof(uint64_t);
|
||||
|
@ -522,7 +522,7 @@ static size_t get_alignment(const std::shared_ptr<socket_t> & sock) {
|
|||
return alignment;
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
static size_t ggml_backend_rpc_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||
return buft_ctx->alignment;
|
||||
}
|
||||
|
@ -540,12 +540,12 @@ static size_t get_max_size(const std::shared_ptr<socket_t> & sock) {
|
|||
return max_size;
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||
return buft_ctx->max_size;
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
UNUSED(buft);
|
||||
return ggml_nbytes(tensor);
|
||||
}
|
||||
|
@ -559,24 +559,24 @@ static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
|
|||
/* .is_host = */ NULL,
|
||||
};
|
||||
|
||||
GGML_CALL static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
||||
static const char * ggml_backend_rpc_name(ggml_backend_t backend) {
|
||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
|
||||
return rpc_ctx->name.c_str();
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
||||
static void ggml_backend_rpc_free(ggml_backend_t backend) {
|
||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
delete rpc_ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
|
||||
static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
||||
static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
|
||||
UNUSED(backend);
|
||||
// this is no-op because we don't have any async operations
|
||||
}
|
||||
|
@ -618,7 +618,7 @@ static void serialize_graph(const ggml_cgraph * cgraph, std::vector<uint8_t> & o
|
|||
memcpy(out_tensors, tensors.data(), n_tensors * sizeof(rpc_tensor));
|
||||
}
|
||||
|
||||
GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
std::vector<uint8_t> input;
|
||||
serialize_graph(cgraph, input);
|
||||
|
@ -630,22 +630,6 @@ GGML_CALL static enum ggml_status ggml_backend_rpc_graph_compute(ggml_backend_t
|
|||
return (enum ggml_status)output[0];
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
UNUSED(backend);
|
||||
UNUSED(op);
|
||||
//TODO: call the remote backend and cache the results
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||
if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
|
||||
return false;
|
||||
}
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||
ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
|
||||
return buft_ctx->endpoint == rpc_ctx->endpoint;
|
||||
}
|
||||
|
||||
static ggml_backend_i ggml_backend_rpc_interface = {
|
||||
/* .get_name = */ ggml_backend_rpc_name,
|
||||
/* .free = */ ggml_backend_rpc_free,
|
||||
|
@ -659,17 +643,14 @@ static ggml_backend_i ggml_backend_rpc_interface = {
|
|||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ NULL,
|
||||
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
|
||||
/* .supports_op = */ ggml_backend_rpc_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_rpc_supports_buft,
|
||||
/* .supports_op = */ NULL,
|
||||
/* .supports_buft = */ NULL,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
// NOTE: buffer types are allocated and never freed; this is by design
|
||||
|
@ -694,13 +675,14 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const
|
|||
|
||||
ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
|
||||
/* .iface = */ ggml_backend_rpc_buffer_type_interface,
|
||||
/* .device = */ ggml_backend_rpc_add_device(endpoint),
|
||||
/* .context = */ buft_ctx
|
||||
};
|
||||
buft_map[endpoint] = buft;
|
||||
return buft;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||
ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
||||
ggml_backend_rpc_context * ctx = new ggml_backend_rpc_context {
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
||||
|
@ -709,12 +691,13 @@ GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
|
|||
ggml_backend_t backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_rpc_guid(),
|
||||
/* .interface = */ ggml_backend_rpc_interface,
|
||||
/* .device = */ ggml_backend_rpc_add_device(endpoint),
|
||||
/* .context = */ ctx
|
||||
};
|
||||
return backend;
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
||||
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_rpc_guid());
|
||||
}
|
||||
|
||||
|
@ -734,7 +717,7 @@ static void get_device_memory(const std::shared_ptr<socket_t> & sock, size_t * f
|
|||
*total = total_mem;
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
||||
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total) {
|
||||
auto sock = get_socket(endpoint);
|
||||
if (sock == nullptr) {
|
||||
*free = 0;
|
||||
|
@ -1190,7 +1173,7 @@ static void rpc_serve_client(ggml_backend_t backend, sockfd_t sockfd, size_t fre
|
|||
}
|
||||
}
|
||||
|
||||
void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
|
||||
void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
|
||||
std::string host;
|
||||
int port;
|
||||
if (!parse_endpoint(endpoint, host, port)) {
|
||||
|
@ -1227,3 +1210,179 @@ void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free
|
|||
WSACleanup();
|
||||
#endif
|
||||
}
|
||||
|
||||
// device interface
|
||||
|
||||
struct ggml_backend_rpc_device_context {
|
||||
std::string endpoint;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
|
||||
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
||||
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static const char * ggml_backend_rpc_device_get_description(ggml_backend_dev_t dev) {
|
||||
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
||||
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
||||
|
||||
ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
|
||||
|
||||
UNUSED(dev);
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
|
||||
// TODO: obtain value from the server
|
||||
return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
|
||||
|
||||
UNUSED(dev);
|
||||
}
|
||||
|
||||
static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_rpc_device_get_name(dev);
|
||||
props->description = ggml_backend_rpc_device_get_description(dev);
|
||||
props->type = ggml_backend_rpc_device_get_type(dev);
|
||||
ggml_backend_rpc_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
/* .host_buffer = */ false,
|
||||
/* .buffer_from_host_ptr = */ false,
|
||||
/* .events = */ false,
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const char * params) {
|
||||
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
||||
|
||||
return ggml_backend_rpc_init(ctx->endpoint.c_str());
|
||||
|
||||
UNUSED(params);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
|
||||
|
||||
return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
|
||||
|
||||
UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
|
||||
UNUSED(dev);
|
||||
UNUSED(max_tensor_size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
UNUSED(dev);
|
||||
UNUSED(op);
|
||||
//TODO: call the remote backend and cache the results
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
|
||||
return false;
|
||||
}
|
||||
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
|
||||
ggml_backend_rpc_device_context * dev_ctx = (ggml_backend_rpc_device_context *)dev->context;
|
||||
return buft_ctx->endpoint == dev_ctx->endpoint;
|
||||
}
|
||||
|
||||
static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
|
||||
/* .get_name = */ ggml_backend_rpc_device_get_name,
|
||||
/* .get_description = */ ggml_backend_rpc_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_rpc_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_rpc_device_get_type,
|
||||
/* .get_props = */ ggml_backend_rpc_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_rpc_device_init,
|
||||
/* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr,
|
||||
/* .supports_op = */ ggml_backend_rpc_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
// backend reg interface
|
||||
|
||||
static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
|
||||
return "RPC";
|
||||
|
||||
UNUSED(reg);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
return 0;
|
||||
|
||||
UNUSED(reg);
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
||||
GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_add_device instead");
|
||||
|
||||
UNUSED(reg);
|
||||
UNUSED(index);
|
||||
}
|
||||
|
||||
static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
|
||||
return (void *)ggml_backend_rpc_add_device;
|
||||
}
|
||||
return NULL;
|
||||
|
||||
UNUSED(reg);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
|
||||
/* .get_name = */ ggml_backend_rpc_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_rpc_reg_get_device_count,
|
||||
/* .get_device = */ ggml_backend_rpc_reg_get_device,
|
||||
/* .get_proc_address = */ ggml_backend_rpc_get_proc_address,
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_rpc_reg(void) {
|
||||
static struct ggml_backend_reg ggml_backend_rpc_reg = {
|
||||
/* .iface = */ ggml_backend_rpc_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_rpc_reg;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
|
||||
static std::unordered_map<std::string, ggml_backend_dev_t> dev_map;
|
||||
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
if (dev_map.find(endpoint) != dev_map.end()) {
|
||||
return dev_map[endpoint];
|
||||
}
|
||||
|
||||
ggml_backend_rpc_device_context * ctx = new ggml_backend_rpc_device_context {
|
||||
/* .endpoint = */ endpoint,
|
||||
/* .name = */ "RPC[" + std::string(endpoint) + "]",
|
||||
};
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_rpc_device_i,
|
||||
/* .reg = */ ggml_backend_rpc_reg(),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
dev_map[endpoint] = dev;
|
||||
|
||||
return dev;
|
||||
}
|
||||
|
|
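With the new device interface, the RPC registry enumerates zero devices up front and instead hands out devices on demand through `ggml_backend_rpc_add_device` (also reachable via the registry's `get_proc_address`). A hedged client-side sketch, not code from the patch, assuming the declarations from `ggml-rpc.h`; the endpoint string is a placeholder:

```cpp
#include "ggml-rpc.h"
#include <cstdio>

bool connect_rpc(const char * endpoint /* e.g. "192.168.1.10:50052" */) {
    ggml_backend_dev_t dev = ggml_backend_rpc_add_device(endpoint);   // cached per endpoint
    if (dev == nullptr) {
        return false;
    }

    size_t free_mem = 0, total_mem = 0;
    ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
    std::printf("RPC device %s: %zu/%zu bytes free\n", endpoint, free_mem, total_mem);

    ggml_backend_t backend = ggml_backend_rpc_init(endpoint);         // backend bound to the same device
    return backend != nullptr;
}
```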
|
@ -4038,7 +4038,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
|
|||
return true;
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
|
||||
GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n");
|
||||
for(int i=0;i<max_len;i++) id_list[i] = -1;
|
||||
|
||||
|
@ -4068,7 +4068,7 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description,
|
||||
GGML_API void ggml_sycl_get_device_description(int device, char *description,
|
||||
size_t description_size) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_device_description\n");
|
||||
dpct::device_info prop;
|
||||
|
@ -4082,7 +4082,7 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free,
|
||||
void ggml_backend_sycl_get_device_memory(int device, size_t *free,
|
||||
size_t *total) try {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n");
|
||||
ggml_sycl_set_device(device);
|
||||
|
@ -4135,12 +4135,12 @@ struct ggml_backend_sycl_buffer_context {
|
|||
}
|
||||
};
|
||||
|
||||
GGML_CALL static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
|
||||
static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
|
||||
}
|
||||
|
||||
|
@ -4162,7 +4162,7 @@ static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|||
return ctx->dev_ptr;
|
||||
}
|
||||
|
||||
GGML_CALL static void
|
||||
static void
|
||||
ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
|
||||
|
@ -4237,7 +4237,7 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_CALL static bool
|
||||
static bool
|
||||
ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
||||
const ggml_tensor *src,
|
||||
ggml_tensor *dst) try {
|
||||
|
@ -4339,12 +4339,12 @@ struct ggml_backend_sycl_buffer_type_context {
|
|||
queue_ptr stream = nullptr;
|
||||
};
|
||||
|
||||
GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
||||
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
GGML_CALL static ggml_backend_buffer_t
|
||||
static ggml_backend_buffer_t
|
||||
ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
|
||||
size_t size) try {
|
||||
ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
|
||||
|
@ -4368,7 +4368,7 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
@ -4379,7 +4379,7 @@ static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_typ
|
|||
UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
size_t size = ggml_nbytes(tensor);
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
|
||||
|
@ -4424,6 +4424,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
|||
queue_ptr stream = &(device_i.default_queue());
|
||||
ggml_backend_sycl_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), stream},
|
||||
};
|
||||
}
|
||||
|
@ -4449,6 +4450,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
|
|||
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
|
||||
ggml_backend_sycl_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i), ctx->stream(i, 0)},
|
||||
};
|
||||
}
|
||||
|
@ -4513,7 +4515,7 @@ struct ggml_backend_sycl_split_buffer_context {
|
|||
std::vector<queue_ptr> streams;
|
||||
};
|
||||
|
||||
GGML_CALL static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return GGML_SYCL_NAME "_Split";
|
||||
|
||||
UNUSED(buffer);
|
||||
|
@ -4523,19 +4525,19 @@ static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
|
|||
return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
GGML_CALL static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
// the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
|
||||
return (void *)0x1000;
|
||||
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
GGML_CALL static void
|
||||
static void
|
||||
ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor) try {
|
||||
GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
|
||||
|
@ -4618,7 +4620,7 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_CALL static void
|
||||
static void
|
||||
ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer,
|
||||
ggml_tensor *tensor, const void *data,
|
||||
size_t offset, size_t size) try {
|
||||
|
@ -4671,7 +4673,7 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_CALL static void
|
||||
static void
|
||||
ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer,
|
||||
const ggml_tensor *tensor, void *data,
|
||||
size_t offset, size_t size) try {
|
||||
|
@ -4724,7 +4726,7 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
UNUSED(buffer);
|
||||
UNUSED(value);
|
||||
}
|
||||
|
@ -4742,13 +4744,13 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
|
|||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
GGML_CALL static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return GGML_SYCL_NAME "_Split";
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
|
||||
// instead, we allocate them for each tensor separately in init_tensor
|
||||
// however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
|
||||
|
@ -4758,12 +4760,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc
|
|||
return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size);
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
return 128;
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context;
|
||||
|
||||
size_t total_size = 0;
|
||||
|
@ -4790,7 +4792,7 @@ GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_
|
|||
return total_size;
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||
static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||
return false;
|
||||
|
||||
UNUSED(buft);
|
||||
|
@ -4805,7 +4807,7 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface
|
|||
/* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
|
||||
};
|
||||
|
||||
GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
|
||||
ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) {
|
||||
static std::mutex mutex;
|
||||
std::lock_guard<std::mutex> lock(mutex);
|
||||
|
||||
|
@ -4837,6 +4839,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f
|
|||
|
||||
struct ggml_backend_buffer_type buft {
|
||||
/* .iface = */ ggml_backend_sycl_split_buffer_type_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr},
|
||||
};
|
||||
|
||||
|
@ -4846,13 +4849,13 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const f
|
|||
|
||||
// host buffer type
|
||||
|
||||
GGML_CALL static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return GGML_SYCL_NAME "_Host";
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
GGML_CALL static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return GGML_SYCL_NAME "_Host";
|
||||
|
||||
UNUSED(buffer);
|
||||
|
@ -4890,6 +4893,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
|||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
|
||||
},
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ nullptr,
|
||||
};
|
||||
|
||||
|
@ -4898,14 +4902,14 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
|
|||
|
||||
// backend
|
||||
|
||||
GGML_CALL static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
||||
static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
|
||||
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
return sycl_ctx->name.c_str();
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) {
|
||||
static void ggml_backend_sycl_free(ggml_backend_t backend) {
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
|
||||
delete sycl_ctx;
|
||||
|
@ -4913,12 +4917,12 @@ GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) {
|
|||
}
|
||||
|
||||
|
||||
GGML_CALL static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
|
||||
static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
return ggml_backend_sycl_buffer_type(sycl_ctx->device);
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||
static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
|
||||
ggml_tensor *tensor,
|
||||
const void *data, size_t offset,
|
||||
size_t size) try {
|
||||
|
@ -4936,7 +4940,7 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_CALL static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||
static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
|
||||
const ggml_tensor *tensor,
|
||||
void *data, size_t offset,
|
||||
size_t size) try {
|
||||
|
@ -4954,9 +4958,9 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
||||
const ggml_tensor *src,
|
||||
ggml_tensor *dst) try {
|
||||
static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend,
|
||||
const ggml_tensor *src,
|
||||
ggml_tensor *dst) try {
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) {
|
||||
/*
|
||||
|
@ -4991,7 +4995,7 @@ catch (sycl::exception const &exc) {
|
|||
std::exit(1);
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
||||
ggml_sycl_set_main_device(sycl_ctx->device);
|
||||
|
||||
|
@ -5019,7 +5023,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
|
|||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
{
|
||||
|
@ -5166,13 +5170,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
|
|||
UNUSED(backend);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
const int min_batch_size = 32;
|
||||
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID;
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||
static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||
if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
|
||||
return false;
|
||||
}
|
||||
|
@ -5197,11 +5201,8 @@ static ggml_backend_i ggml_backend_sycl_interface = {
|
|||
/* .supports_op = */ ggml_backend_sycl_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_sycl_supports_buft,
|
||||
/* .offload_op = */ ggml_backend_sycl_offload_op,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_sycl_guid() {
|
||||
|
@ -5209,7 +5210,7 @@ static ggml_guid_t ggml_backend_sycl_guid() {
|
|||
return &guid;
|
||||
}
|
||||
|
||||
GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
|
||||
ggml_backend_t ggml_backend_sycl_init(int device) {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n");
|
||||
ggml_check_sycl();
|
||||
|
||||
|
@ -5224,6 +5225,7 @@ GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) {
|
|||
ggml_backend_t sycl_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_sycl_guid(),
|
||||
/* .interface = */ ggml_backend_sycl_interface,
|
||||
/* .device = */ nullptr,
|
||||
/* .context = */ ctx
|
||||
};
|
||||
|
||||
|
@ -5234,26 +5236,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) {
|
|||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid());
|
||||
}
|
||||
|
||||
GGML_CALL int ggml_backend_sycl_get_device_count() {
|
||||
int ggml_backend_sycl_get_device_count() {
|
||||
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n");
|
||||
return ggml_sycl_info().device_count;
|
||||
}
|
||||
|
||||
GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
|
||||
ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
|
||||
return sycl_backend;
|
||||
|
||||
UNUSED(params);
|
||||
}
|
||||
|
||||
extern "C" int ggml_backend_sycl_reg_devices();
|
||||
|
||||
int ggml_backend_sycl_reg_devices() {
|
||||
assert(ggml_sycl_info().device_count>0);
|
||||
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
|
||||
char name[128];
|
||||
snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i);
|
||||
ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i);
|
||||
}
|
||||
return ggml_sycl_info().device_count;
|
||||
}
|
||||
|
|
|
@@ -55,12 +55,12 @@ static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib,
 #ifdef GGML_SYCL_F16
     // v = v * {d, d};
     // v = v + {m, m};
-    v.s0() = (v.s0() * d) + m;
-    v.s1() = (v.s1() * d) + m;
+    v.s0() = sycl::fma(v.s0(), d, m);
+    v.s1() = sycl::fma(v.s1(), d, m);
 
 #else
-    v.x() = (v.x() * d) + m;
-    v.y() = (v.y() * d) + m;
+    v.x() = sycl::fma(v.x(), d, m);
+    v.y() = sycl::fma(v.y(), d, m);
 #endif // GGML_SYCL_F16
 }
 

@@ -110,11 +110,11 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int64_t ib,
 #ifdef GGML_SYCL_F16
     // v = v * {d, d};
     // v = v + {m, m};
-    v.s0() = (v.s0() * d) + m;
-    v.s1() = (v.s1() * d) + m;
+    v.s0() = sycl::fma(v.s0(), d, m);
+    v.s1() = sycl::fma(v.s1(), d, m);
 #else
-    v.x() = (v.x() * d) + m;
-    v.y() = (v.y() * d) + m;
+    v.x() = sycl::fma(v.x(), d, m);
+    v.y() = sycl::fma(v.y(), d, m);
 #endif // GGML_SYCL_F16
 }
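Replacing `v*d + m` with `sycl::fma(v, d, m)` computes the multiply-add with a single rounding step. The standard-C++ equivalent below (an illustrative sketch with arbitrary values, not part of the patch) shows that the fused and unfused results can differ in the last bit.

```cpp
#include <cmath>
#include <cstdio>

int main() {
    const float v = 0.1f, d = 3.0f, m = -0.3f;

    volatile float t = v*d;                   // force a separate rounding of the product
    const float unfused = t + m;              // two roundings
    const float fused   = std::fma(v, d, m);  // one rounding of the exact v*d + m

    std::printf("unfused=%.9g fused=%.9g\n", unfused, fused);
    return 0;
}
```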
File diff suppressed because it is too large

ggml/src/ggml.c (331 changed lines)
@@ -319,26 +319,63 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
 // logging
 //
 
+struct ggml_logger_state {
+    ggml_log_callback log_callback;
+    void * log_callback_user_data;
+};
+static struct ggml_logger_state g_logger_state = {ggml_log_callback_default, NULL};
+
+static void ggml_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
+    if (format == NULL)
+        return;
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+    } else {
+        char * buffer2 = (char *) calloc(len + 1, sizeof(char));
+        vsnprintf(buffer2, len + 1, format, args_copy);
+        buffer2[len] = 0;
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+        free(buffer2);
+    }
+    va_end(args_copy);
+}
+
+void ggml_log_internal(enum ggml_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    ggml_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
+
 #if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#define GGML_PRINT_DEBUG(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG(...)
 #endif
 
 #if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#define GGML_PRINT_DEBUG_5(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG_5(...)
 #endif
 
 #if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#define GGML_PRINT_DEBUG_10(...) GGML_LOG_DEBUG(__VA_ARGS__)
 #else
 #define GGML_PRINT_DEBUG_10(...)
 #endif
 
 #define GGML_PRINT(...) printf(__VA_ARGS__)
 
 //
 // end of logging block
 //
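`ggml_log_internal_v` formats into a 128-byte stack buffer, falls back to a heap allocation for longer messages, and hands the text to the callback stored in `g_logger_state` (stderr by default). A custom callback only needs to match the signature of `ggml_log_callback_default`; the sketch below filters by level. How it gets registered depends on whatever setter the build exposes, which is not shown in this hunk and is assumed here.

```cpp
#include <stdio.h>

// Custom ggml_log_callback: drop DEBUG output, prefix the rest with the level.
static void my_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_DEBUG) {
        return;                              // silence debug chatter
    }
    const char * tag = level == GGML_LOG_LEVEL_ERROR ? "E" :
                       level == GGML_LOG_LEVEL_WARN  ? "W" : "I";
    fprintf(stderr, "[%s] %s", tag, text);   // most call sites already end the text with '\n'
    fflush(stderr);
}
```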
@ -355,7 +392,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
|
|||
#else
|
||||
inline static void * ggml_aligned_malloc(size_t size) {
|
||||
if (size == 0) {
|
||||
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
|
||||
GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
|
||||
return NULL;
|
||||
}
|
||||
void * aligned_memory = NULL;
|
||||
|
@ -377,7 +414,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|||
error_desc = "insufficient memory";
|
||||
break;
|
||||
}
|
||||
GGML_PRINT("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
|
||||
GGML_LOG_ERROR("%s: %s (attempted to allocate %6.2f MB)\n", __func__, error_desc, size/(1024.0*1024.0));
|
||||
GGML_ABORT("fatal error");
|
||||
return NULL;
|
||||
}
|
||||
|
@ -393,12 +430,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|||
|
||||
inline static void * ggml_malloc(size_t size) {
|
||||
if (size == 0) {
|
||||
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
|
||||
GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_malloc!\n");
|
||||
return NULL;
|
||||
}
|
||||
void * result = malloc(size);
|
||||
if (result == NULL) {
|
||||
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
|
||||
GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
return result;
|
||||
|
@ -407,12 +444,12 @@ inline static void * ggml_malloc(size_t size) {
|
|||
// calloc
|
||||
inline static void * ggml_calloc(size_t num, size_t size) {
|
||||
if (num == 0 || size == 0) {
|
||||
GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
|
||||
GGML_LOG_WARN("Behavior may be unexpected when allocating 0 bytes for ggml_calloc!\n");
|
||||
return NULL;
|
||||
}
|
||||
void * result = calloc(num, size);
|
||||
if (result == NULL) {
|
||||
GGML_PRINT("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
|
||||
GGML_LOG_ERROR("%s: failed to allocate %6.2f MB\n", __func__, size/(1024.0*1024.0));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
return result;
|
||||
|
@ -461,7 +498,7 @@ struct ggml_arm_arch_features_type {
|
|||
} ggml_arm_arch_features = {-1, -1, -1, 0};
|
||||
#endif
|
||||
|
||||
GGML_CALL const char * ggml_status_to_string(enum ggml_status status) {
|
||||
const char * ggml_status_to_string(enum ggml_status status) {
|
||||
switch (status) {
|
||||
case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)";
|
||||
case GGML_STATUS_FAILED: return "GGML status: error (operation failed)";
|
||||
|
@ -692,7 +729,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float *
|
|||
static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc);
|
||||
static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc);
|
||||
|
||||
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = {
|
||||
[GGML_TYPE_I8] = {
|
||||
.type_name = "i8",
|
||||
.blck_size = 1,
|
||||
|
@ -1114,9 +1151,9 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|||
};
|
||||
|
||||
// For internal test use
|
||||
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
||||
const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) {
|
||||
GGML_ASSERT(type < GGML_TYPE_COUNT);
|
||||
return type_traits[type];
|
||||
return &type_traits[type];
|
||||
}
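// Example (sketch, not part of the change): reading type metadata through the new
// pointer-returning accessor; GGML_TYPE_Q4_0 and the printed fields are illustrative
// picks from the traits table above.
static void ggml_example_print_type_traits(void) {
    const struct ggml_type_traits * tt = ggml_get_type_traits(GGML_TYPE_Q4_0);
    GGML_LOG_INFO("%s: blck_size = %d, is_quantized = %d\n",
            tt->type_name, (int) tt->blck_size, tt->is_quantized ? 1 : 0);
}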

//
@@ -2957,6 +2994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "SUM_ROWS",
    "MEAN",
    "ARGMAX",
    "COUNT_EQUAL",
    "REPEAT",
    "REPEAT_BACK",
    "CONCAT",

@@ -3030,7 +3068,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "OPT_STEP_ADAMW",
};

static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");

static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",

@@ -3051,6 +3089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "Σx_k",
    "Σx/n",
    "argmax(x)",
    "count_equal(x)",
    "repeat(x)",
    "repeat_back(x)",
    "concat(x, y)",

@@ -3124,7 +3163,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "adamw(x)",
};

static_assert(GGML_OP_COUNT == 80, "GGML_OP_COUNT != 80");
static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");

static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -3347,7 +3386,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
        if (fptr != NULL) {
            char buf[42];
            if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
                GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
                GGML_LOG_WARN("/proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
            }
            fclose(fptr);
        }

@@ -3365,36 +3404,36 @@ bool ggml_is_numa(void) {
////////////////////////////////////////////////////////////////////////////////

void ggml_print_object(const struct ggml_object * obj) {
    GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
    GGML_LOG_INFO(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
            obj->type, obj->offs, obj->size, (const void *) obj->next);
}

void ggml_print_objects(const struct ggml_context * ctx) {
    struct ggml_object * obj = ctx->objects_begin;

    GGML_PRINT("%s: objects in context %p:\n", __func__, (const void *) ctx);
    GGML_LOG_INFO("%s: objects in context %p:\n", __func__, (const void *) ctx);

    while (obj != NULL) {
        ggml_print_object(obj);
        obj = obj->next;
    }

    GGML_PRINT("%s: --- end ---\n", __func__);
    GGML_LOG_INFO("%s: --- end ---\n", __func__);
}

GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
int64_t ggml_nelements(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}

GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
int64_t ggml_nrows(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
}

GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
    size_t nbytes;
    size_t blck_size = ggml_blck_size(tensor->type);
    if (blck_size == 1) {

@@ -3417,15 +3456,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
    return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
}

GGML_CALL int64_t ggml_blck_size(enum ggml_type type) {
int64_t ggml_blck_size(enum ggml_type type) {
    return type_traits[type].blck_size;
}

GGML_CALL size_t ggml_type_size(enum ggml_type type) {
size_t ggml_type_size(enum ggml_type type) {
    return type_traits[type].type_size;
}

GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
size_t ggml_row_size(enum ggml_type type, int64_t ne) {
    assert(ne % ggml_blck_size(type) == 0);
    return ggml_type_size(type)*ne/ggml_blck_size(type);
}

@@ -3434,15 +3473,15 @@ double ggml_type_sizef(enum ggml_type type) {
    return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
}

GGML_CALL const char * ggml_type_name(enum ggml_type type) {
const char * ggml_type_name(enum ggml_type type) {
    return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE";
}

GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
bool ggml_is_quantized(enum ggml_type type) {
    return type_traits[type].is_quantized;
}

GGML_CALL const char * ggml_op_name(enum ggml_op op) {
const char * ggml_op_name(enum ggml_op op) {
    return GGML_OP_NAME[op];
}

@@ -3454,7 +3493,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
    return GGML_UNARY_OP_NAME[op];
}

GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
const char * ggml_op_desc(const struct ggml_tensor * t) {
    if (t->op == GGML_OP_UNARY) {
        enum ggml_unary_op uop = ggml_get_unary_op(t);
        return ggml_unary_op_name(uop);

@@ -3462,7 +3501,7 @@ GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
    return ggml_op_name(t->op);
}

GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
size_t ggml_element_size(const struct ggml_tensor * tensor) {
    return ggml_type_size(tensor->type);
}

@@ -3555,7 +3594,7 @@ size_t ggml_tensor_overhead(void) {
    return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
}

GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
    return tensor->nb[0] > tensor->nb[1];
}

@@ -3581,23 +3620,23 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) {
    return true;
}

GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
    return ggml_is_contiguous_0(tensor);
}

GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
    return ggml_is_contiguous_n(tensor, 0);
}

GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
    return ggml_is_contiguous_n(tensor, 1);
}

GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
    return ggml_is_contiguous_n(tensor, 2);
}

GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
bool ggml_is_permuted(const struct ggml_tensor * tensor) {
    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

    return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];

@@ -3612,7 +3651,7 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}

GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
bool ggml_is_empty(const struct ggml_tensor * tensor) {
    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
        if (tensor->ne[i] == 0) {
            // empty if any dimension has no elements

@@ -3962,7 +4001,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
    struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);

    if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
        GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
        GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
                __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
        assert(false);
        return NULL;

@@ -4026,7 +4065,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
    if (ctx->scratch.data != NULL) {
        // allocate tensor data in the scratch buffer
        if (ctx->scratch.offs + data_size > ctx->scratch.size) {
            GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
            GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
                    __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
            assert(false);
            return NULL;

@@ -4195,9 +4234,13 @@ static void ggml_set_op_params_f32(struct ggml_tensor * tensor, uint32_t i, floa
}
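// Note added for readability (not part of the change): ggml_set_zero() below now also
// handles tensors whose storage lives in a backend buffer (e.g. device memory) by
// dispatching to ggml_backend_tensor_memset() instead of writing tensor->data directly;
// plain host tensors still take the memset() path.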

struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
    if (ggml_is_empty(tensor)) {
        return tensor;
    }
    if (tensor->buffer) {
        ggml_backend_tensor_memset(tensor, 0, 0, ggml_nbytes(tensor));
    } else {
        GGML_ASSERT(tensor->data);
        memset(tensor->data, 0, ggml_nbytes(tensor));
    }
    return tensor;

@@ -4628,7 +4671,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
    return (float *)(tensor->data);
}

GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor->op == GGML_OP_UNARY);
    return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
}

@@ -5181,6 +5224,23 @@ struct ggml_tensor * ggml_argmax(
    return result;
}

// ggml_count_equal

struct ggml_tensor * ggml_count_equal(
        struct ggml_context * ctx,
        struct ggml_tensor * a,
        struct ggml_tensor * b) {
    GGML_ASSERT(ggml_are_same_shape(a, b));

    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 1);

    result->op = GGML_OP_COUNT_EQUAL;
    result->src[0] = a;
    result->src[1] = b;

    return result;
}
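// Example (sketch, not part of the change): building and evaluating a COUNT_EQUAL node on
// the CPU. The helpers used here (ggml_new_graph, ggml_build_forward_expand,
// ggml_graph_compute_with_ctx) are the usual ggml graph API; a and b are assumed to be
// same-shape GGML_TYPE_I32 tensors already filled by the caller.
static int64_t ggml_example_count_matches(struct ggml_context * ctx,
                                          struct ggml_tensor * a,
                                          struct ggml_tensor * b) {
    struct ggml_tensor * eq = ggml_count_equal(ctx, a, b); // 1-element GGML_TYPE_I64 result

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, eq);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/4);

    return ((const int64_t *) eq->data)[0];
}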

// ggml_repeat

struct ggml_tensor * ggml_repeat(

@@ -10816,6 +10876,86 @@ static void ggml_compute_forward_argmax(
    }
}

// ggml_compute_forward_count_equal

static void ggml_compute_forward_count_equal_i32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];

    GGML_TENSOR_BINARY_OP_LOCALS;

    GGML_ASSERT(src0->type == GGML_TYPE_I32);
    GGML_ASSERT(src1->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_are_same_shape(src0, src1));
    GGML_ASSERT(ggml_is_scalar(dst));
    GGML_ASSERT(dst->type == GGML_TYPE_I64);

    const int64_t nr = ggml_nrows(src0);

    const int ith = params->ith;
    const int nth = params->nth;

    int64_t * sums = (int64_t *) params->wdata;
    int64_t sum_thread = 0;

    // rows per thread
    const int64_t dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int64_t ir0 = dr*ith;
    const int64_t ir1 = MIN(ir0 + dr, nr);

    for (int64_t ir = ir0; ir < ir1; ++ir) {
        const int64_t i03 = ir / (ne02*ne01);
        const int64_t i02 = (ir - i03*ne03) / ne01;
        const int64_t i01 = ir - i03*ne03 - i02*ne02;

        const char * data0 = (const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01;
        const char * data1 = (const char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11;

        for (int64_t i00 = 0; i00 < ne00; ++i00) {
            const int32_t val0 = *((const int32_t *) (data0 + i00*nb00));
            const int32_t val1 = *((const int32_t *) (data1 + i00*nb10));

            sum_thread += val0 == val1;
        }
    }
    if (ith != 0) {
        sums[ith] = sum_thread;
    }
    ggml_barrier(params->threadpool);

    if (ith != 0) {
        return;
    }

    for (int ith_other = 1; ith_other < nth; ++ith_other) {
        sum_thread += sums[ith_other];
    }
    *((int64_t *) dst->data) = sum_thread;
}
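// Note (not part of the change): the int64_t sums[] scratch above keeps one partial count
// per thread in params->wdata; the matching work-buffer size for GGML_OP_COUNT_EQUAL,
// ggml_type_size(node->type)*n_tasks, is reserved in ggml_graph_plan() further below.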

static void ggml_compute_forward_count_equal(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
        case GGML_TYPE_I32:
            {
                ggml_compute_forward_count_equal_i32(params, dst);
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
}

// ggml_compute_forward_repeat

static void ggml_compute_forward_repeat_f32(

@@ -12779,6 +12919,10 @@ static void ggml_compute_forward_out_prod_f32(

    GGML_TENSOR_BINARY_OP_LOCALS

    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);

    const int ith = params->ith;
    const int nth = params->nth;

@@ -14108,7 +14252,7 @@ static void ggml_rope_cache_init(
    }
}

GGML_CALL void ggml_rope_yarn_corr_dims(
void ggml_rope_yarn_corr_dims(
    int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
) {
    // start and end correction dims

@@ -16938,41 +17082,40 @@ static void ggml_compute_forward_cross_entropy_loss_f32(
    const struct ggml_tensor * src0 = dst->src[0];
    const struct ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(src1));
    GGML_ASSERT(ggml_is_scalar(dst));
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
    GGML_ASSERT(ggml_are_same_shape(src0, src1));
    GGML_ASSERT(ggml_is_scalar(dst));
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    // TODO: handle transposed/permuted matrices
    const int64_t nc = src0->ne[0];
    const int64_t nr = ggml_nrows(src0);

    const int ith = params->ith;
    const int nth = params->nth;

    float * sums = (float *) params->wdata;

    // TODO: handle transposed/permuted matrices
    const int nc = src0->ne[0];
    const int nr = ggml_nrows(src0);
    float * sums = (float *) params->wdata;
    float * st = ((float *) params->wdata) + nth + ith*nc;
    float sum_thread = 0.0f;

    GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc));

    if (ith == 0) {
        memset(sums, 0, sizeof(float) * (nth + nth * nc));
    }
    ggml_barrier(params->threadpool);

    // rows per thread
    const int dr = (nr + nth - 1)/nth;
    const int64_t dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);
    const int64_t ir0 = dr*ith;
    const int64_t ir1 = MIN(ir0 + dr, nr);

    for (int i1 = ir0; i1 < ir1; i1++) {
        float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
        float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
        float * st = ((float *) params->wdata) + nth + ith*nc;
    for (int64_t i1 = ir0; i1 < ir1; ++i1) {
        const float * s0 = (const float *)((const char *) src0->data + i1*src0->nb[1]);
        const float * s1 = (const float *)((const char *) src1->data + i1*src1->nb[1]);

#ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
        for (int64_t i = 0; i < nc; ++i) {
            //printf("p[%d] = %f\n", i, p[i]);
            assert(!isnan(s0[i]));
            assert(!isnan(s1[i]));

@@ -16981,23 +17124,24 @@ static void ggml_compute_forward_cross_entropy_loss_f32(

        float max = -INFINITY;
        ggml_vec_max_f32(nc, &max, s0);
        ggml_float sum = ggml_vec_log_soft_max_f32(nc, st, s0, max);
        assert(sum >= 0.0);
        const ggml_float sum_softmax = ggml_vec_log_soft_max_f32(nc, st, s0, max);
        assert(sum_softmax >= 0.0);

        ggml_vec_add1_f32(nc, st, st, -sum);
        ggml_vec_add1_f32(nc, st, st, -sum_softmax);
        ggml_vec_mul_f32(nc, st, st, s1);

        float st_sum = 0.0f;
        ggml_vec_sum_f32(nc, &st_sum, st);
        sums[ith] += st_sum;
        float sum_st = 0.0f;
        ggml_vec_sum_f32(nc, &sum_st, st);
        sum_thread += sum_st;

#ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
        for (int64_t i = 0; i < nc; ++i) {
            assert(!isnan(st[i]));
            assert(!isinf(st[i]));
        }
#endif
    }
    sums[ith] = sum_thread;
    ggml_barrier(params->threadpool);

    if (ith == 0) {
@@ -17063,7 +17207,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
        float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);

#ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
        for (int64_t i = 0; i < nc; ++i) {
            //printf("p[%d] = %f\n", i, p[i]);
            assert(!isnan(s0[i]));
            assert(!isnan(s1[i]));

@@ -17082,7 +17226,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
        ggml_vec_scale_f32(nc, ds0, d_by_nr);

#ifndef NDEBUG
        for (int i = 0; i < nc; ++i) {
        for (int64_t i = 0; i < nc; ++i) {
            assert(!isnan(ds0[i]));
            assert(!isinf(ds0[i]));
        }

@@ -17270,6 +17414,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_argmax(params, tensor);
            } break;
        case GGML_OP_COUNT_EQUAL:
            {
                ggml_compute_forward_count_equal(params, tensor);
            } break;
        case GGML_OP_REPEAT:
            {
                ggml_compute_forward_repeat(params, tensor);

@@ -18020,6 +18168,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
            } break;
        case GGML_OP_MEAN:
        case GGML_OP_ARGMAX:
        case GGML_OP_COUNT_EQUAL:
            {
                GGML_ABORT("fatal error"); // TODO: implement
            }

@@ -18793,6 +18942,10 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
    for (int i = 0; i < gf->n_nodes; ++i) {
        struct ggml_tensor * node = gf->nodes[i];

        if (node->type == GGML_TYPE_I32) {
            continue;
        }

        bool needs_grad = node->flags & GGML_TENSOR_FLAG_PARAM;
        bool ignore_src[GGML_MAX_SRC] = {false};
        switch (node->op) {

@@ -19196,6 +19349,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_SUM_ROWS:
        case GGML_OP_MEAN:
        case GGML_OP_ARGMAX:
            {
                n_tasks = 1;
            } break;
        case GGML_OP_COUNT_EQUAL:
            {
                n_tasks = n_threads;
            } break;
        case GGML_OP_REPEAT:
        case GGML_OP_REPEAT_BACK:
        case GGML_OP_LEAKY_RELU:

@@ -19694,6 +19854,10 @@ struct ggml_cplan ggml_graph_plan(
                    cur = ggml_type_size(GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                }
            } break;
        case GGML_OP_COUNT_EQUAL:
            {
                cur = ggml_type_size(node->type)*n_tasks;
            } break;
        case GGML_OP_MUL_MAT:
            {
                const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;

@@ -20137,7 +20301,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
    }
#else
    if (n_threads > threadpool->n_threads_max) {
        GGML_PRINT("WARNING: cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
        GGML_LOG_WARN("cplan requested more threads (%d) than available (%d)\n", n_threads, threadpool->n_threads_max);
        n_threads = threadpool->n_threads_max;
    }

@@ -20676,30 +20840,30 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context *
}

void ggml_graph_print(const struct ggml_cgraph * cgraph) {
    GGML_PRINT("=== GRAPH ===\n");
    GGML_LOG_INFO("=== GRAPH ===\n");

    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
    GGML_LOG_INFO("n_nodes = %d\n", cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];

        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
        GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n",
                i,
                node->ne[0], node->ne[1], node->ne[2],
                ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ");
    }

    GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs);
    GGML_LOG_INFO("n_leafs = %d\n", cgraph->n_leafs);
    for (int i = 0; i < cgraph->n_leafs; i++) {
        struct ggml_tensor * node = cgraph->leafs[i];

        GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
        GGML_LOG_INFO(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s %16s\n",
                i,
                node->ne[0], node->ne[1],
                ggml_op_name(node->op),
                ggml_get_name(node));
    }

    GGML_PRINT("========================================\n");
    GGML_LOG_INFO("========================================\n");
}

// check if node is part of the graph

@@ -20870,7 +21034,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph

    fclose(fp);

    GGML_PRINT("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
    GGML_LOG_INFO("%s: dot -Tpng %s -o %s.png && open %s.png\n", __func__, filename, filename, filename);
}

////////////////////////////////////////////////////////////////////////////////

@@ -23365,4 +23529,9 @@ int ggml_cpu_get_sve_cnt(void) {
    return 0;
#endif
}

void ggml_log_set(ggml_log_callback log_callback, void * user_data) {
    g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default;
    g_logger_state.log_callback_user_data = user_data;
}
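// Example (sketch, not part of the change): installing a user log sink. The callback
// signature matches ggml_log_callback_default() above; passing NULL restores the default
// stderr sink. Routing to a FILE* via user_data is just an illustrative choice.
static void ggml_example_log_to_file(enum ggml_log_level level, const char * text, void * user_data) {
    FILE * f = (FILE *) user_data;
    fprintf(f, "[ggml:%d] %s", (int) level, text); // messages already carry their own newlines
    fflush(f);
}
// called once at startup, e.g.: ggml_log_set(ggml_example_log_to_file, stderr);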
////////////////////////////////////////////////////////////////////////////////