ggml-backend : add names to buffers
This commit is contained in:
parent
33f0761e9b
commit
6483328fa9
6 changed files with 250 additions and 196 deletions
|
@ -16,6 +16,7 @@ extern "C" {
|
|||
typedef void * ggml_backend_buffer_type_context_t;
|
||||
|
||||
struct ggml_backend_buffer_type_i {
|
||||
const char * (*get_name) (ggml_backend_buffer_type_t buft);
|
||||
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
|
||||
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
||||
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||
|
@ -34,16 +35,17 @@ extern "C" {
|
|||
typedef void * ggml_backend_buffer_context_t;
|
||||
|
||||
struct ggml_backend_buffer_i {
|
||||
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||
void * (*get_base) (ggml_backend_buffer_t buffer);
|
||||
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
const char * (*get_name) (ggml_backend_buffer_t buffer);
|
||||
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||
void * (*get_base) (ggml_backend_buffer_t buffer);
|
||||
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
|
||||
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
};
|
||||
|
||||
struct ggml_backend_buffer {
|
||||
|
|
254
ggml-backend.c
254
ggml-backend.c
|
@ -15,6 +15,10 @@
|
|||
|
||||
// backend buffer type
|
||||
|
||||
const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name(buft);
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
return buft->iface.alloc_buffer(buft, size);
|
||||
}
|
||||
|
@ -64,6 +68,10 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
|||
return buffer;
|
||||
}
|
||||
|
||||
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return buffer->iface.get_name(buffer);
|
||||
}
|
||||
|
||||
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
|
||||
if (buffer == NULL) {
|
||||
return;
|
||||
|
@ -397,6 +405,12 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
|
|||
|
||||
// backend CPU
|
||||
|
||||
static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return (void *)buffer->context;
|
||||
}
|
||||
|
@ -434,6 +448,7 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|||
}
|
||||
|
||||
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
||||
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||
/* .init_tensor = */ NULL, // no initialization required
|
||||
|
@ -446,6 +461,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
|||
|
||||
// for buffers from ptr, free is not called
|
||||
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
||||
/* .get_name = */ ggml_backend_cpu_buffer_name,
|
||||
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
|
||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||
/* .init_tensor = */ NULL, // no initialization required
|
||||
|
@ -458,6 +474,12 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
|||
|
||||
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
||||
|
||||
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
|
||||
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
|
||||
|
@ -488,6 +510,7 @@ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft
|
|||
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
|
@ -506,6 +529,18 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
|||
|
||||
#include <hbwmalloc.h>
|
||||
|
||||
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU_HBM";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
|
||||
return "CPU_HBM";
|
||||
|
||||
GGML_UNUSED(buf);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
hbw_free(buffer->context);
|
||||
}
|
||||
|
@ -522,14 +557,16 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
|
|||
// FIXME: this is a hack to avoid having to implement a new buffer type
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
|
||||
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
|
@ -795,6 +832,7 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
|
|||
// if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
|
||||
// ie. kv cache updates
|
||||
// note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
|
||||
|
||||
// dst
|
||||
ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
|
||||
if (cur_backend != NULL) {
|
||||
|
@ -809,7 +847,6 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
|
|||
}
|
||||
|
||||
// src
|
||||
int cur_prio = INT_MAX;
|
||||
size_t cur_size = 0;
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
|
@ -826,16 +863,12 @@ static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct
|
|||
break;
|
||||
}
|
||||
|
||||
//if (src_backend != NULL) {
|
||||
int src_prio = sched_backend_prio(sched, src_backend);
|
||||
size_t src_size = ggml_nbytes(src);
|
||||
if (/*src_prio < cur_prio &&*/ src_size >= cur_size) {
|
||||
cur_prio = src_prio;
|
||||
cur_size = src_size;
|
||||
cur_backend = src_backend;
|
||||
SET_CAUSE(node, "1.src%d", i);
|
||||
}
|
||||
//}
|
||||
size_t src_size = ggml_nbytes(src);
|
||||
if (src_size >= cur_size) {
|
||||
cur_size = src_size;
|
||||
cur_backend = src_backend;
|
||||
SET_CAUSE(node, "1.src%d", i);
|
||||
}
|
||||
}
|
||||
return cur_backend;
|
||||
}
|
||||
|
@ -946,55 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|||
}
|
||||
//printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
||||
|
||||
#if 0
|
||||
// pass 2: assign backends to ops from current assignments
|
||||
// TODO:
|
||||
// - reuse sched_backend_from_cur
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
||||
if (node_allocr == NULL) {
|
||||
int cur_prio = INT_MAX;
|
||||
size_t cur_size = 0;
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
break;
|
||||
}
|
||||
ggml_tallocr_t src_allocr = node_allocr(src);
|
||||
if (src_allocr != NULL) {
|
||||
int src_prio = sched_allocr_prio(sched, src_allocr);
|
||||
size_t src_size = ggml_nbytes(src);
|
||||
if (src_prio < cur_prio && src_size >= cur_size) {
|
||||
cur_prio = src_prio;
|
||||
cur_size = src_size;
|
||||
node_allocr = src_allocr;
|
||||
SET_CAUSE(node, "2.src%d", j);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (node_allocr != NULL) {
|
||||
node_allocr(node) = node_allocr;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
// pass 2: assign backends to ops from current assignments
|
||||
// start from the end and assign the same backend to previous ops
|
||||
{
|
||||
ggml_tallocr_t cur_allocr = NULL;
|
||||
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
||||
if (node_allocr != NULL) {
|
||||
cur_allocr = node_allocr;
|
||||
} else {
|
||||
node_allocr(node) = cur_allocr;
|
||||
SET_CAUSE(node, "2.cur");
|
||||
ggml_tallocr_t cur_allocr = NULL;
|
||||
for (int i = graph->n_nodes - 1; i >= 0; i--) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
||||
if (node_allocr != NULL) {
|
||||
cur_allocr = node_allocr;
|
||||
} else {
|
||||
node_allocr(node) = cur_allocr;
|
||||
SET_CAUSE(node, "2.cur");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
//printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
||||
|
||||
// pass 3: assign backends to remaining src from dst (should only be leafs)
|
||||
|
@ -1015,86 +1015,85 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|||
//printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
|
||||
|
||||
// pass 4: split graph, find tensors that need to be copied
|
||||
// TODO:
|
||||
// - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
|
||||
// find first backend
|
||||
int cur_split = 0;
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
if (node->view_src == NULL) {
|
||||
sched->splits[0].tallocr = node_allocr(node);
|
||||
break;
|
||||
}
|
||||
}
|
||||
sched->splits[0].i_start = 0;
|
||||
sched->splits[0].n_inputs = 0;
|
||||
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
||||
ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
|
||||
size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
|
||||
if (ggml_is_view_op(node->op)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
||||
|
||||
if (node_allocr != cur_allocr) {
|
||||
sched->splits[cur_split].i_end = i;
|
||||
cur_split++;
|
||||
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
||||
sched->splits[cur_split].tallocr = node_allocr;
|
||||
sched->splits[cur_split].i_start = i;
|
||||
sched->splits[cur_split].n_inputs = 0;
|
||||
memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
|
||||
cur_allocr = node_allocr;
|
||||
cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
||||
}
|
||||
|
||||
// find inputs that are not on the same backend
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
{
|
||||
int cur_split = 0;
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
if (node->view_src == NULL) {
|
||||
sched->splits[0].tallocr = node_allocr(node);
|
||||
break;
|
||||
}
|
||||
ggml_tallocr_t src_allocr = node_allocr(src);
|
||||
if (src_allocr != node_allocr) {
|
||||
// check if the input is already in the split
|
||||
bool found = false;
|
||||
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
|
||||
if (sched->splits[cur_split].inputs[k] == src) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
sched->splits[0].i_start = 0;
|
||||
sched->splits[0].n_inputs = 0;
|
||||
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
|
||||
ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
|
||||
size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
|
||||
if (ggml_is_view_op(node->op)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
||||
|
||||
if (node_allocr != cur_allocr) {
|
||||
sched->splits[cur_split].i_end = i;
|
||||
cur_split++;
|
||||
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
|
||||
sched->splits[cur_split].tallocr = node_allocr;
|
||||
sched->splits[cur_split].i_start = i;
|
||||
sched->splits[cur_split].n_inputs = 0;
|
||||
memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
|
||||
cur_allocr = node_allocr;
|
||||
cur_backend_id = sched_allocr_prio(sched, cur_allocr);
|
||||
}
|
||||
|
||||
// find inputs that are not on the same backend
|
||||
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||
struct ggml_tensor * src = node->src[j];
|
||||
if (src == NULL) {
|
||||
break;
|
||||
}
|
||||
ggml_tallocr_t src_allocr = node_allocr(src);
|
||||
if (src_allocr != node_allocr) {
|
||||
// check if the input is already in the split
|
||||
bool found = false;
|
||||
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
|
||||
if (sched->splits[cur_split].inputs[k] == src) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
int n_inputs = sched->splits[cur_split].n_inputs++;
|
||||
//printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
|
||||
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
||||
sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
|
||||
}
|
||||
if (!found) {
|
||||
int n_inputs = sched->splits[cur_split].n_inputs++;
|
||||
//printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
|
||||
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
||||
sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
|
||||
}
|
||||
|
||||
// create copies
|
||||
size_t id = hash_id(src);
|
||||
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
||||
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
||||
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
||||
node_allocr(tensor_copy) = cur_allocr;
|
||||
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
|
||||
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
||||
// create copies
|
||||
size_t id = hash_id(src);
|
||||
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
||||
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
||||
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
||||
node_allocr(tensor_copy) = cur_allocr;
|
||||
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
|
||||
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
||||
}
|
||||
node->src[j] = sched->node_copies[id][cur_backend_id];
|
||||
}
|
||||
node->src[j] = sched->node_copies[id][cur_backend_id];
|
||||
}
|
||||
}
|
||||
sched->splits[cur_split].i_end = graph->n_nodes;
|
||||
sched->n_splits = cur_split + 1;
|
||||
}
|
||||
sched->splits[cur_split].i_end = graph->n_nodes;
|
||||
sched->n_splits = cur_split + 1;
|
||||
|
||||
//fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
|
||||
|
||||
#if 1
|
||||
#ifndef NDEBUG
|
||||
// sanity check: all sources should have the same backend as the node
|
||||
for (int i = 0; i < graph->n_nodes; i++) {
|
||||
struct ggml_tensor * node = graph->nodes[i];
|
||||
|
@ -1269,10 +1268,13 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
|
|||
sched_reset(sched);
|
||||
}
|
||||
|
||||
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
void ggml_backend_sched_graph_split(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
||||
|
||||
sched_split_graph(sched, graph);
|
||||
}
|
||||
|
||||
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
ggml_backend_sched_graph_split(sched, graph);
|
||||
sched_alloc_splits(sched);
|
||||
sched_compute_splits(sched);
|
||||
sched_reset(sched);
|
||||
|
|
|
@ -17,11 +17,12 @@ extern "C" {
|
|||
//
|
||||
|
||||
// buffer type
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||
|
||||
// buffer
|
||||
enum ggml_backend_buffer_usage {
|
||||
|
@ -29,16 +30,17 @@ extern "C" {
|
|||
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
|
||||
};
|
||||
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
|
||||
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type (ggml_backend_buffer_t buffer);
|
||||
|
||||
|
||||
//
|
||||
|
@ -147,24 +149,27 @@ extern "C" {
|
|||
typedef struct ggml_backend_sched * ggml_backend_sched_t;
|
||||
|
||||
// Initialize a backend scheduler
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
|
||||
|
||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
|
||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||
// Initialize backend buffers from a measure graph
|
||||
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
||||
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
||||
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
||||
// Get the number of splits of the last graph
|
||||
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
|
||||
|
||||
GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
|
||||
|
||||
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
|
||||
|
||||
// Allocate a graph on the backend scheduler
|
||||
// Allocate and compute graph on the backend scheduler
|
||||
GGML_API void ggml_backend_sched_graph_compute(
|
||||
ggml_backend_sched_t sched,
|
||||
struct ggml_cgraph * graph);
|
||||
|
||||
// Split without computing - only useful to find the number of splits
|
||||
GGML_API void ggml_backend_sched_graph_split(
|
||||
ggml_backend_sched_t sched,
|
||||
struct ggml_cgraph * graph);
|
||||
|
||||
//
|
||||
// Utils
|
||||
|
|
72
ggml-cuda.cu
72
ggml-cuda.cu
|
@ -8,6 +8,7 @@
|
|||
#include <limits>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
||||
|
@ -77,6 +78,7 @@
|
|||
#define cudaMemcpyKind hipMemcpyKind
|
||||
#define cudaMemset hipMemset
|
||||
#define cudaMemsetAsync hipMemsetAsync
|
||||
#define cudaMemGetInfo hipMemGetInfo
|
||||
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
||||
#define cudaSetDevice hipSetDevice
|
||||
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||
|
@ -6885,6 +6887,8 @@ void ggml_init_cublas() {
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: cleanup this after the split buffer type is implemented
|
||||
#if 0
|
||||
void ggml_cuda_set_tensor_split(const float * tensor_split) {
|
||||
if (tensor_split == nullptr) {
|
||||
return;
|
||||
|
@ -6908,6 +6912,7 @@ void ggml_cuda_set_tensor_split(const float * tensor_split) {
|
|||
g_tensor_split[i] /= split_sum;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void * ggml_cuda_host_malloc(size_t size) {
|
||||
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
|
||||
|
@ -9349,10 +9354,9 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
|||
|
||||
struct ggml_backend_context_cuda {
|
||||
int device;
|
||||
char name[128];
|
||||
std::string name;
|
||||
};
|
||||
|
||||
|
||||
// cuda buffer
|
||||
|
||||
struct ggml_backend_buffer_context_cuda {
|
||||
|
@ -9360,8 +9364,12 @@ struct ggml_backend_buffer_context_cuda {
|
|||
void * dev_ptr = nullptr;
|
||||
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
||||
size_t temp_tensor_extra_index = 0;
|
||||
std::string name;
|
||||
|
||||
ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
|
||||
ggml_backend_buffer_context_cuda(int device, void * dev_ptr) :
|
||||
device(device), dev_ptr(dev_ptr),
|
||||
name(GGML_CUDA_NAME + std::to_string(device)) {
|
||||
}
|
||||
|
||||
~ggml_backend_buffer_context_cuda() {
|
||||
delete[] temp_tensor_extras;
|
||||
|
@ -9381,6 +9389,11 @@ struct ggml_backend_buffer_context_cuda {
|
|||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
||||
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
||||
|
@ -9457,6 +9470,7 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
|
|||
}
|
||||
|
||||
static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
|
||||
|
@ -9469,21 +9483,32 @@ static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
|
|||
|
||||
// cuda buffer type
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
int device = (int) (intptr_t) buft->context;
|
||||
struct ggml_backend_cuda_buffer_type_context {
|
||||
int device;
|
||||
std::string name;
|
||||
};
|
||||
|
||||
ggml_cuda_set_device(device);
|
||||
static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
|
||||
return ctx->name.c_str();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
|
||||
ggml_cuda_set_device(buft_ctx->device);
|
||||
|
||||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
||||
|
||||
void * dev_ptr;
|
||||
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, device, cudaGetErrorString(err));
|
||||
fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
|
||||
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(buft_ctx->device, dev_ptr);
|
||||
|
||||
return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
|
||||
}
|
||||
|
@ -9519,13 +9544,14 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
|
|||
return false;
|
||||
}
|
||||
|
||||
int device = (int) (intptr_t) buft->context;
|
||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||
|
||||
return device == cuda_ctx->device;
|
||||
return buft_ctx->device == cuda_ctx->device;
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
|
||||
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
|
||||
|
@ -9542,7 +9568,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|||
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
|
||||
ggml_backend_cuda_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
|
||||
/* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
|
||||
/* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
|
||||
};
|
||||
}
|
||||
ggml_backend_cuda_buffer_type_initialized = true;
|
||||
|
@ -9553,6 +9579,18 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
|
|||
|
||||
// host buffer type
|
||||
|
||||
static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CUDA_Host";
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
|
||||
return "CUDA_Host";
|
||||
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
ggml_cuda_host_free(buffer->context);
|
||||
}
|
||||
|
@ -9568,6 +9606,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
|||
// FIXME: this is a hack to avoid having to implement a new buffer type
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
|
||||
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
|
||||
|
||||
return buffer;
|
||||
|
@ -9576,6 +9615,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
|
|||
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
|
||||
|
@ -9591,12 +9631,9 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
|||
// backend
|
||||
|
||||
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
||||
//return GGML_CUDA_NAME;
|
||||
//UNUSED(backend);
|
||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
||||
// TODO: on init
|
||||
sprintf(cuda_ctx->name, "%s%d", "CUD", cuda_ctx->device);
|
||||
return cuda_ctx->name;
|
||||
|
||||
return cuda_ctx->name.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||
|
@ -9861,7 +9898,8 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
|||
ggml_cuda_set_main_device(device);
|
||||
|
||||
ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
|
||||
/* .device = */ device
|
||||
/* .device = */ device,
|
||||
/* .name = */ GGML_CUDA_NAME + std::to_string(device),
|
||||
};
|
||||
|
||||
ggml_backend_t cuda_backend = new ggml_backend {
|
||||
|
|
20
ggml-metal.m
20
ggml-metal.m
|
@ -2442,10 +2442,10 @@ static void ggml_backend_metal_free_device(void) {
|
|||
}
|
||||
}
|
||||
|
||||
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||
static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||
return "Metal";
|
||||
|
||||
return ctx->all_data;
|
||||
UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
|
@ -2463,6 +2463,12 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
|
|||
free(ctx);
|
||||
}
|
||||
|
||||
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||
|
||||
return ctx->all_data;
|
||||
}
|
||||
|
||||
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
memcpy((char *)tensor->data + offset, data, size);
|
||||
|
||||
|
@ -2494,6 +2500,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
|
|||
}
|
||||
|
||||
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
||||
/* .get_name = */ ggml_backend_metal_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
||||
/* .init_tensor = */ NULL,
|
||||
|
@ -2506,6 +2513,12 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
|||
|
||||
// default buffer type
|
||||
|
||||
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "Metal";
|
||||
|
||||
UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
||||
|
||||
|
@ -2578,6 +2591,7 @@ static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t bu
|
|||
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||
static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
|
|
29
llama.cpp
29
llama.cpp
|
@ -1731,7 +1731,7 @@ static bool llama_kv_cache_init(
|
|||
}
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
// FIXME: buffer type name
|
||||
LLAMA_LOG_INFO("%s: KV %10s buffer size: %.02f MiB\n", __func__, "???", ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: %10s KV buffer size = %7.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
||||
cache.bufs.push_back(buf);
|
||||
}
|
||||
|
||||
|
@ -3169,12 +3169,13 @@ static bool llm_load_tensors(
|
|||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
if (split_mode == CUDA_SPLIT_LAYER) {
|
||||
// calculate the split points
|
||||
int device_count = ggml_backend_cuda_get_device_count();
|
||||
float splits[GGML_CUDA_MAX_DEVICES];
|
||||
std::copy(tensor_split, tensor_split + device_count, splits);
|
||||
bool all_zero = std::all_of(splits, splits + device_count, [](float x) { return x == 0.0f; });
|
||||
if (all_zero) {
|
||||
// set by free memory
|
||||
// default split, by free memory
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
size_t total;
|
||||
size_t free;
|
||||
|
@ -3189,21 +3190,17 @@ static bool llm_load_tensors(
|
|||
}
|
||||
for (int i = 0; i < device_count; ++i) {
|
||||
splits[i] /= split_sum;
|
||||
printf("split[%d] = %.2f\n", i, splits[i]);
|
||||
}
|
||||
|
||||
// assign GPU layers according to the splits to the devices
|
||||
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
|
||||
|
||||
// assign layers proportionally, in reverse order
|
||||
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
||||
int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
|
||||
printf("layer %d -> gpu %d\n", (int)i, layer_gpu);
|
||||
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
|
||||
}
|
||||
// output layer
|
||||
if (n_gpu_layers > n_layer) {
|
||||
int layer_gpu = std::upper_bound(splits, splits + device_count, float(n_layer)/act_gpu_layers) - splits;
|
||||
printf("output -> gpu %d\n", layer_gpu);
|
||||
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
|
||||
} else {
|
||||
model.buft_output = llama_default_buffer_type_cpu(true);
|
||||
|
@ -3250,7 +3247,7 @@ static bool llm_load_tensors(
|
|||
model.ctxs.push_back(ctx);
|
||||
}
|
||||
|
||||
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
|
||||
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
|
||||
|
||||
// create tensors for the weights
|
||||
{
|
||||
|
@ -3764,14 +3761,7 @@ static bool llm_load_tensors(
|
|||
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
||||
|
||||
for (ggml_backend_buffer_t buf : model.bufs) {
|
||||
// FIXME: add buffer type names to ggml-backend
|
||||
const char * name;
|
||||
if (ggml_backend_buffer_type(buf) == ggml_backend_cpu_buffer_type()) {
|
||||
name = "CPU";
|
||||
} else {
|
||||
name = "???";
|
||||
}
|
||||
LLAMA_LOG_INFO("%s: %10s buffer size = %7.2f MiB\n", __func__, name, ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
||||
LLAMA_LOG_INFO("%s: %10s buffer size = %7.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6224,6 +6214,8 @@ static int llama_decode_internal(
|
|||
}
|
||||
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
||||
|
||||
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
||||
#endif
|
||||
|
@ -9269,13 +9261,14 @@ struct llama_context * llama_new_context_with_model(
|
|||
|
||||
// initialize scheduler with the worst-case graph
|
||||
ggml_backend_sched_init_measure(ctx->sched, gf);
|
||||
// note: the number of splits during measure is higher than during inference due to the kv shift
|
||||
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
||||
LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
|
||||
LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
|
||||
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
||||
|
||||
for (ggml_backend_t backend : backends) {
|
||||
ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
|
||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %.2f MiB\n", __func__,
|
||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %7.2f MiB\n", __func__,
|
||||
ggml_backend_name(backend),
|
||||
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue