metal : adapting to ggml_backend (WIP)

This commit is contained in:
Georgi Gerganov 2023-07-18 16:54:41 +03:00
parent 1102ff56db
commit 0a3861c47b
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 99 additions and 41 deletions

View file

@ -19,51 +19,56 @@
#pragma once
#include "ggml.h"
#include <stddef.h>
#include <stdbool.h>
// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 16
struct ggml_tensor;
struct ggml_cgraph;
//struct ggml_tensor;
//struct ggml_cgraph;
#ifdef __cplusplus
extern "C" {
#endif
struct ggml_metal_context;
// GG: maybe return ptr and avoid the "ggml.h" include
// Returns a struct ggml_backend by value; that type is not declared in this
// header (presumably it comes from "ggml.h" - confirm).
// The parameter list is spelled (void) so that this is a real prototype in C:
// an empty list "()" leaves the arguments unchecked before C23.
struct ggml_backend ggml_backend_metal_init(void);
// - n_cb: number of command buffers to use
// - returns a pointer to a struct ggml_metal_context, a type this header only forward-declares
// NOTE(review): what is returned on failure (e.g. NULL) is not visible from this header - confirm
struct ggml_metal_context * ggml_metal_init(int n_cb);
// - ctx: the context to free (presumably one obtained from ggml_metal_init - confirm)
// NOTE(review): whether ctx may be NULL is not visible from this header
void ggml_metal_free(struct ggml_metal_context * ctx);
// set the number of command buffers to use
// - ctx:  context whose command-buffer count is changed
// - n_cb: the new number of command buffers
// NOTE(review): the valid range of n_cb is not stated in this header - confirm in the implementation
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
// creates a mapping between a host memory buffer and a device memory buffer
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
// - the mapping is used during computation to determine the arguments of the compute kernels
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
// - max_size specifies the maximum size of a tensor and is used to create shared views such
//   that it is guaranteed that the tensor will fit in at least one of the views
//
// parameters:
// - ctx:      context that receives the mapping
// - name:     label for the buffer (presumably used for identification/logging - confirm)
// - data:     host memory buffer to map
// - size:     size of the host buffer (presumably in bytes - confirm)
// - max_size: maximum size of a tensor, see above
// returns a bool (presumably true on success - confirm in the implementation)
//struct ggml_metal_context;
//
bool ggml_metal_add_buffer(
struct ggml_metal_context * ctx,
const char * name,
void * data,
size_t size,
size_t max_size);
// set data from host memory into the device
// - ctx: context used for the transfer
// - t:   tensor whose data is transferred
// NOTE(review): presumably t must live in a buffer registered via ggml_metal_add_buffer - confirm
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
// get data from the device into host memory
// - ctx: context used for the transfer
// - t:   tensor whose data is transferred
// NOTE(review): presumably t must live in a buffer registered via ggml_metal_add_buffer - confirm
void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
// - ctx: context used for the computation
// - gf:  the compute graph to evaluate
// - all buffers used in the graph must be mapped with ggml_metal_add_buffer before this call
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
//// number of command buffers to use
//struct ggml_metal_context * ggml_metal_init(int n_cb);
//void ggml_metal_free(struct ggml_metal_context * ctx);
//
//// set the number of command buffers to use
//void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
//
//// creates a mapping between a host memory buffer and a device memory buffer
//// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
//// - the mapping is used during computation to determine the arguments of the compute kernels
//// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
//// - max_size specifies the maximum size of a tensor and is used to create shared views such
//// that it is guaranteed that the tensor will fit in at least one of the views
////
//bool ggml_metal_add_buffer(
// struct ggml_metal_context * ctx,
// const char * name,
// void * data,
// size_t size,
// size_t max_size);
//
//// set data from host memory into the device
//void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// get data from the device into host memory
//void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
//
//// same as ggml_graph_compute but uses Metal
//// creates gf->n_threads command buffers in parallel
//void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
#ifdef __cplusplus
}