ggml-backend : add names to buffers

commit 6483328fa9
parent 33f0761e9b

6 changed files with 250 additions and 196 deletions
ggml-backend-impl.h

@@ -16,6 +16,7 @@ extern "C" {
     typedef void * ggml_backend_buffer_type_context_t;
 
     struct ggml_backend_buffer_type_i {
+        const char * (*get_name) (ggml_backend_buffer_type_t buft);
         ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
         size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
         size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
@@ -34,6 +35,7 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;
 
     struct ggml_backend_buffer_i {
+        const char * (*get_name) (ggml_backend_buffer_t buffer);
         void (*free_buffer) (ggml_backend_buffer_t buffer);
         //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
         void * (*get_base) (ggml_backend_buffer_t buffer);
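The interface change itself is small: both ggml_backend_buffer_type_i and ggml_backend_buffer_i gain a get_name callback as their first member, so every buffer type and every allocated buffer can report a human-readable name. A minimal sketch of what a backend now provides (a hypothetical "Example" backend, for illustration only, not part of this commit):

// Illustration only. Relies on the internal header ggml-backend-impl.h for the
// interface struct and the buffer context field.
#include "ggml-backend-impl.h"
#include <stdlib.h>

static const char * example_buffer_get_name(ggml_backend_buffer_t buffer) {
    (void) buffer; // the name does not depend on the buffer instance
    return "Example";
}

static void example_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context); // context is assumed to be a plain malloc'd block here
}

static void * example_buffer_get_base(ggml_backend_buffer_t buffer) {
    return buffer->context;
}

static struct ggml_backend_buffer_i example_buffer_i = {
    // only the members relevant to this sketch are set; a real backend also
    // fills in init_tensor, set_tensor, get_tensor, clear, etc.
    .get_name    = example_buffer_get_name,
    .free_buffer = example_buffer_free_buffer,
    .get_base    = example_buffer_get_base,
};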
ggml-backend.c

@@ -15,6 +15,10 @@
 
 // backend buffer type
 
+const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
+    return buft->iface.get_name(buft);
+}
+
 ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     return buft->iface.alloc_buffer(buft, size);
 }
@@ -64,6 +68,10 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
     return buffer;
 }
 
+const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name(buffer);
+}
+
 void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
     if (buffer == NULL) {
         return;
@@ -397,6 +405,12 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
 
 // backend CPU
 
+static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+    return "CPU";
+
+    GGML_UNUSED(buffer);
+}
+
 static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     return (void *)buffer->context;
 }
@@ -434,6 +448,7 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
+    /* .get_name = */ ggml_backend_cpu_buffer_name,
     /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor = */ NULL, // no initialization required
@@ -446,6 +461,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
 
 // for buffers from ptr, free is not called
 static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
+    /* .get_name = */ ggml_backend_cpu_buffer_name,
     /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
     /* .get_base = */ ggml_backend_cpu_buffer_get_base,
     /* .init_tensor = */ NULL, // no initialization required
@@ -458,6 +474,12 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
 
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
 
+static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU";
+
+    GGML_UNUSED(buft);
+}
+
 static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
@@ -488,6 +510,7 @@ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
+            /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
             /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
@@ -506,6 +529,18 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
 
 #include <hbwmalloc.h>
 
+static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buft);
+}
+
+static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+    return "CPU_HBM";
+
+    GGML_UNUSED(buf);
+}
+
 static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     hbw_free(buffer->context);
 }
@@ -522,14 +557,16 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
     // FIXME: this is a hack to avoid having to implement a new buffer type
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
+    buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
     buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
 
     return buffer;
 }
 
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
         /* .iface = */ {
+            /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
             /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
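The CPU_HBM type shows the pattern used when a buffer type piggybacks on the CPU implementation: allocate with ggml_backend_cpu_buffer_from_ptr() and then patch the copied interface, so the buffer reports its real origin and frees memory with the matching allocator. A hedged sketch of the same pattern for a hypothetical custom allocator (my_alloc, my_free and the "MyAlloc" name are placeholders, not ggml symbols):

// Illustration only: mirrors the CPU_HBM / CUDA_Host approach in this commit.
#include "ggml-backend-impl.h"
#include <stdlib.h>

// hypothetical allocator, standing in for hbw_malloc()/hbw_free() or pinned memory
static void * my_alloc(size_t size) { return malloc(size); }
static void   my_free (void * ptr)  { free(ptr); }

static const char * my_buffer_get_name(ggml_backend_buffer_t buffer) {
    (void) buffer;
    return "MyAlloc";
}

static void my_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    my_free(buffer->context);
}

static ggml_backend_buffer_t my_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * ptr = my_alloc(size);
    if (ptr == NULL) {
        return NULL;
    }
    // reuse the CPU buffer implementation, then override identity and destructor
    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
    buffer->buft              = buft;
    buffer->iface.get_name    = my_buffer_get_name;
    buffer->iface.free_buffer = my_buffer_free_buffer;
    return buffer;
}

Without the get_name override, such a buffer would keep reporting "CPU" even though it was not allocated with malloc.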
@@ -795,6 +832,7 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
     // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
     // ie. kv cache updates
     // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
+
     // dst
     ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
     if (cur_backend != NULL) {
@@ -809,7 +847,6 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
     }
 
     // src
-    int cur_prio = INT_MAX;
     size_t cur_size = 0;
 
     for (int i = 0; i < GGML_MAX_SRC; i++) {
@@ -826,16 +863,12 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
             break;
         }
 
-        //if (src_backend != NULL) {
-        int src_prio = sched_backend_prio(sched, src_backend);
         size_t src_size = ggml_nbytes(src);
-        if (/*src_prio < cur_prio &&*/ src_size >= cur_size) {
-            cur_prio = src_prio;
+        if (src_size >= cur_size) {
             cur_size = src_size;
             cur_backend = src_backend;
             SET_CAUSE(node, "1.src%d", i);
         }
-        //}
     }
     return cur_backend;
 }
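With the priority bookkeeping gone, sched_backend_from_cur reduces to a single rule: keep the backend of an already-allocated destination, otherwise pick the backend that already holds the largest source (typically the weight tensor), since that is the operand that would be most expensive to copy. A standalone sketch of that rule; backend_of() is a hypothetical stand-in for the scheduler's internal buffer-to-backend lookup, not a ggml API:

#include "ggml.h"
#include "ggml-backend.h"

// Pick the backend that already holds the most input data for this node.
static ggml_backend_t pick_backend_by_largest_src(
        const struct ggml_tensor * node,
        ggml_backend_t (*backend_of)(const struct ggml_tensor * t)) {
    ggml_backend_t cur_backend = NULL;
    size_t cur_size = 0;
    for (int i = 0; i < GGML_MAX_SRC; i++) {
        const struct ggml_tensor * src = node->src[i];
        if (src == NULL) {
            break;
        }
        ggml_backend_t src_backend = backend_of(src);
        size_t src_size = ggml_nbytes(src);
        if (src_backend != NULL && src_size >= cur_size) {
            cur_size    = src_size;
            cur_backend = src_backend;
        }
    }
    return cur_backend;
}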
@@ -946,39 +979,6 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     }
     //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 
-#if 0
-    // pass 2: assign backends to ops from current assignments
-    // TODO:
-    //  - reuse sched_backend_from_cur
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t node_allocr = node_allocr(node);
-        if (node_allocr == NULL) {
-            int cur_prio = INT_MAX;
-            size_t cur_size = 0;
-            for (int j = 0; j < GGML_MAX_SRC; j++) {
-                struct ggml_tensor * src = node->src[j];
-                if (src == NULL) {
-                    break;
-                }
-                ggml_tallocr_t src_allocr = node_allocr(src);
-                if (src_allocr != NULL) {
-                    int src_prio = sched_allocr_prio(sched, src_allocr);
-                    size_t src_size = ggml_nbytes(src);
-                    if (src_prio < cur_prio && src_size >= cur_size) {
-                        cur_prio = src_prio;
-                        cur_size = src_size;
-                        node_allocr = src_allocr;
-                        SET_CAUSE(node, "2.src%d", j);
-                    }
-                }
-            }
-            if (node_allocr != NULL) {
-                node_allocr(node) = node_allocr;
-            }
-        }
-    }
-#else
     // pass 2: assign backends to ops from current assignments
     // start from the end and assign the same backend to previous ops
     {
@@ -994,7 +994,7 @@
             }
         }
     }
-#endif
     //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 
     // pass 3: assign backends to remaining src from dst (should only be leafs)
@@ -1015,9 +1015,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
     //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
 
     // pass 4: split graph, find tensors that need to be copied
-    // TODO:
-    //  - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
-    // find first backend
+    {
         int cur_split = 0;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
@@ -1091,10 +1089,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         }
         sched->splits[cur_split].i_end = graph->n_nodes;
         sched->n_splits = cur_split + 1;
+    }
 
     //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
 
-#if 1
+#ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -1269,10 +1268,13 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
     sched_reset(sched);
 }
 
-void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+void ggml_backend_sched_graph_split(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
 
     sched_split_graph(sched, graph);
+}
+
+void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+    ggml_backend_sched_graph_split(sched, graph);
     sched_alloc_splits(sched);
     sched_compute_splits(sched);
     sched_reset(sched);
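Splitting and computing are now separate entry points: ggml_backend_sched_graph_split() only runs the partitioning passes, and ggml_backend_sched_graph_compute() calls it before allocating and executing the splits. That makes it possible to ask how a graph would be partitioned without running it, for example (a small sketch assuming sched and graph were set up with the usual ggml-backend calls):

#include <stdio.h>
#include "ggml-backend.h"

// Report how the scheduler would partition a graph, without computing it.
static void print_graph_splits(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
    ggml_backend_sched_graph_split(sched, graph);
    fprintf(stderr, "graph splits: %d\n", ggml_backend_sched_get_n_splits(sched));
}

The same counter is what the commented-out fprintf added to llama_decode_internal() later in this diff reads after ggml_backend_sched_graph_compute().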
ggml-backend.h

@@ -17,6 +17,7 @@ extern "C" {
     //
 
     // buffer type
+    GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
     GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
@@ -29,6 +30,7 @@ extern "C" {
         GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
     };
 
+    GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
     GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
     GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
@@ -148,11 +150,10 @@ extern "C" {
 
     // Initialize a backend scheduler
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
 
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
 
     // Initialize backend buffers from a measure graph
     GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    // Get the number of splits of the last graph
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
 
     GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
@@ -160,11 +161,15 @@ extern "C" {
 
     GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
 
-    // Allocate a graph on the backend scheduler
+    // Allocate and compute graph on the backend scheduler
     GGML_API void ggml_backend_sched_graph_compute(
             ggml_backend_sched_t sched,
             struct ggml_cgraph * graph);
 
+    // Split without computing - only useful to find the number of splits
+    GGML_API void ggml_backend_sched_graph_split(
+            ggml_backend_sched_t sched,
+            struct ggml_cgraph * graph);
+
     //
     // Utils
 
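On the consumer side, callers no longer need to compare buffer types to guess where a buffer lives; they can query the name directly through the public API. A small illustrative example (not part of this commit):

#include <stdio.h>
#include "ggml-backend.h"

// Print where a buffer lives and how large it is, in the same spirit as the
// llama.cpp log lines updated later in this commit.
static void print_buffer_info(ggml_backend_buffer_t buf) {
    printf("%s buffer, %.2f MiB\n",
           ggml_backend_buffer_name(buf),
           ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
}

For instance, ggml_backend_buft_name(ggml_backend_cpu_buffer_type()) returns "CPU", per the implementation added in ggml-backend.c above.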
72 ggml-cuda.cu

@@ -8,6 +8,7 @@
 #include <limits>
 #include <stdint.h>
 #include <stdio.h>
+#include <string>
 #include <vector>
 
 
@@ -77,6 +78,7 @@
 #define cudaMemcpyKind hipMemcpyKind
 #define cudaMemset hipMemset
 #define cudaMemsetAsync hipMemsetAsync
+#define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
@@ -6885,6 +6887,8 @@ void ggml_init_cublas() {
     }
 }
 
+// TODO: cleanup this after the split buffer type is implemented
+#if 0
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
     if (tensor_split == nullptr) {
         return;
@@ -6908,6 +6912,7 @@ void ggml_cuda_set_tensor_split(const float * tensor_split) {
         g_tensor_split[i] /= split_sum;
     }
 }
+#endif
 
 void * ggml_cuda_host_malloc(size_t size) {
     if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
@@ -9349,10 +9354,9 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
 
 struct ggml_backend_context_cuda {
     int device;
-    char name[128];
+    std::string name;
 };
 
-
 // cuda buffer
 
 struct ggml_backend_buffer_context_cuda {
@@ -9360,8 +9364,12 @@ struct ggml_backend_buffer_context_cuda {
     void * dev_ptr = nullptr;
     ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
     size_t temp_tensor_extra_index = 0;
+    std::string name;
 
-    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) :
+        device(device), dev_ptr(dev_ptr),
+        name(GGML_CUDA_NAME + std::to_string(device)) {
+    }
 
     ~ggml_backend_buffer_context_cuda() {
         delete[] temp_tensor_extras;
@@ -9381,6 +9389,11 @@ struct ggml_backend_buffer_context_cuda {
     }
 };
 
+static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    return ctx->name.c_str();
+}
+
 static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
     CUDA_CHECK(cudaFree(ctx->dev_ptr));
@@ -9457,6 +9470,7 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+    /* .get_name = */ ggml_backend_cuda_buffer_get_name,
     /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
     /* .get_base = */ ggml_backend_cuda_buffer_get_base,
     /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
@@ -9469,21 +9483,32 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
 
 // cuda buffer type
 
-static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    int device = (int) (intptr_t) buft->context;
+struct ggml_backend_cuda_buffer_type_context {
+    int device;
+    std::string name;
+};
 
-    ggml_cuda_set_device(device);
+static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+
+    return ctx->name.c_str();
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+
+    ggml_cuda_set_device(buft_ctx->device);
 
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
     void * dev_ptr;
     cudaError_t err = cudaMalloc(&dev_ptr, size);
     if (err != cudaSuccess) {
-        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, device, cudaGetErrorString(err));
+        fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
         return nullptr;
     }
 
-    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(buft_ctx->device, dev_ptr);
 
     return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
 }
@@ -9519,13 +9544,14 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
         return false;
     }
 
-    int device = (int) (intptr_t) buft->context;
+    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
-    return device == cuda_ctx->device;
+    return buft_ctx->device == cuda_ctx->device;
 }
 
 static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
+    /* .get_name = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
@@ -9542,7 +9568,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
         for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
             ggml_backend_cuda_buffer_types[i] = {
                 /* .iface = */ ggml_backend_cuda_buffer_type_interface,
-                /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+                /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
             };
         }
         ggml_backend_cuda_buffer_type_initialized = true;
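Each CUDA buffer type now carries a small context with the device index and a prebuilt name instead of smuggling the index through the context pointer, so per-device buffers are distinguishable in logs (for example "CUDA0", "CUDA1", assuming GGML_CUDA_NAME expands to "CUDA"; HIP builds use a different prefix). A hedged usage sketch over the public API:

#include <stdio.h>
#include "ggml-backend.h"
#include "ggml-cuda.h"

// Print the buffer type name reported for each CUDA device.
static void print_cuda_buffer_type_names(void) {
    int n_devices = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n_devices; i++) {
        printf("device %d: %s\n", i,
               ggml_backend_buft_name(ggml_backend_cuda_buffer_type(i)));
    }
}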
@@ -9553,6 +9579,18 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
 
 // host buffer type
 
+static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return "CUDA_Host";
+
+    UNUSED(buft);
+}
+
+static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
+    return "CUDA_Host";
+
+    UNUSED(buffer);
+}
+
 static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_cuda_host_free(buffer->context);
 }
@@ -9568,6 +9606,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     // FIXME: this is a hack to avoid having to implement a new buffer type
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
+    buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
     buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
 
     return buffer;
@@ -9576,6 +9615,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
 ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
         /* .iface = */ {
+            /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
             /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
             /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
@@ -9591,12 +9631,9 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
 // backend
 
 static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-    //return GGML_CUDA_NAME;
-    //UNUSED(backend);
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-    // TODO: on init
-    sprintf(cuda_ctx->name, "%s%d", "CUD", cuda_ctx->device);
-    return cuda_ctx->name;
+
+    return cuda_ctx->name.c_str();
 }
 
 static void ggml_backend_cuda_free(ggml_backend_t backend) {
@@ -9861,7 +9898,8 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_cuda_set_main_device(device);
 
     ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
-        /* .device = */ device
+        /* .device = */ device,
+        /* .name   = */ GGML_CUDA_NAME + std::to_string(device),
     };
 
     ggml_backend_t cuda_backend = new ggml_backend {

20 ggml-metal.m
@@ -2442,10 +2442,10 @@ static void ggml_backend_metal_free_device(void) {
     }
 }
 
-static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
-    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
+    return "Metal";
 
-    return ctx->all_data;
+    UNUSED(buffer);
 }
 
 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
@@ -2463,6 +2463,12 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     free(ctx);
 }
 
+static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    return ctx->all_data;
+}
+
 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
@@ -2494,6 +2500,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 }
 
 static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
+    /* .get_name = */ ggml_backend_metal_buffer_get_name,
     /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
     /* .get_base = */ ggml_backend_metal_buffer_get_base,
     /* .init_tensor = */ NULL,
@@ -2506,6 +2513,12 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
 
 // default buffer type
 
+static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+    return "Metal";
+
+    UNUSED(buft);
+}
+
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
@@ -2578,6 +2591,7 @@ static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t bu
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
         /* .iface = */ {
+            /* .get_name = */ ggml_backend_metal_buffer_type_get_name,
             /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
             /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
             /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes

27 llama.cpp
@@ -1731,7 +1731,7 @@ static bool llama_kv_cache_init(
         }
         ggml_backend_buffer_clear(buf, 0);
         // FIXME: buffer type name
-        LLAMA_LOG_INFO("%s: KV %10s buffer size: %.02f MiB\n", __func__, "???", ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: %10s KV buffer size = %7.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         cache.bufs.push_back(buf);
     }
 
@@ -3169,12 +3169,13 @@ static bool llm_load_tensors(
 
 #ifdef GGML_USE_CUBLAS
     if (split_mode == CUDA_SPLIT_LAYER) {
+        // calculate the split points
        int device_count = ggml_backend_cuda_get_device_count();
         float splits[GGML_CUDA_MAX_DEVICES];
         std::copy(tensor_split, tensor_split + device_count, splits);
         bool all_zero = std::all_of(splits, splits + device_count, [](float x) { return x == 0.0f; });
         if (all_zero) {
-            // set by free memory
+            // default split, by free memory
             for (int i = 0; i < device_count; ++i) {
                 size_t total;
                 size_t free;
@@ -3189,21 +3190,17 @@ static bool llm_load_tensors(
         }
         for (int i = 0; i < device_count; ++i) {
             splits[i] /= split_sum;
-            printf("split[%d] = %.2f\n", i, splits[i]);
         }
 
+        // assign GPU layers according to the splits to the devices
         int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
 
-        // assign layers proportionally, in reverse order
         for (int64_t i = i_gpu_start; i < n_layer; ++i) {
             int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
-            printf("layer %d -> gpu %d\n", (int)i, layer_gpu);
             model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
         }
         // output layer
         if (n_gpu_layers > n_layer) {
             int layer_gpu = std::upper_bound(splits, splits + device_count, float(n_layer)/act_gpu_layers) - splits;
-            printf("output -> gpu %d\n", layer_gpu);
             model.buft_output = llama_default_buffer_type_offload(layer_gpu);
         } else {
             model.buft_output = llama_default_buffer_type_cpu(true);
@@ -3764,14 +3761,7 @@ static bool llm_load_tensors(
         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
 
         for (ggml_backend_buffer_t buf : model.bufs) {
-            // FIXME: add buffer type names to ggml-backend
-            const char * name;
-            if (ggml_backend_buffer_type(buf) == ggml_backend_cpu_buffer_type()) {
-                name = "CPU";
-            } else {
-                name = "???";
-            }
-            LLAMA_LOG_INFO("%s: %10s buffer size = %7.2f MiB\n", __func__, name, ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: %10s buffer size = %7.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
         }
     }
 
@@ -6224,6 +6214,8 @@ static int llama_decode_internal(
     }
     ggml_backend_sched_graph_compute(lctx.sched, gf);
 
+    // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
+
 #ifdef GGML_USE_MPI
     ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
 #endif
@@ -9269,13 +9261,14 @@ struct llama_context * llama_new_context_with_model(
 
             // initialize scheduler with the worst-case graph
             ggml_backend_sched_init_measure(ctx->sched, gf);
+            // note: the number of splits during measure is higher than during inference due to the kv shift
             int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
+            LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
             ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
 
             for (ggml_backend_t backend : backends) {
                 ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
-                LLAMA_LOG_INFO("%s: %10s compute buffer size = %.2f MiB\n", __func__,
+                LLAMA_LOG_INFO("%s: %10s compute buffer size = %7.2f MiB\n", __func__,
                     ggml_backend_name(backend),
                     ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
             }