sched : add a new split if the current one has too many inputs

reduce max inputs per split
more cleanup
parent 980907595f
commit 0661e6a1ae

5 changed files with 81 additions and 70 deletions
@@ -114,10 +114,10 @@ static std::string get_cpu_info() {
 static std::string get_gpu_info() {
     std::string id;
 #ifdef GGML_USE_CUBLAS
-    int count = ggml_cuda_get_device_count();
+    int count = ggml_backend_cuda_get_device_count();
     for (int i = 0; i < count; i++) {
         char buf[128];
-        ggml_cuda_get_device_description(i, buf, sizeof(buf));
+        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
         id += buf;
         if (i < count - 1) {
             id += "/";
@@ -768,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg

     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        if (cpu_plan->cplan.work_data == NULL) {
+            free(cpu_plan);
+            return NULL;
+        }
     }

     cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@@ -1007,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif

 #ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 1024
+#define GGML_SCHED_MAX_SPLITS 2048
 #endif

 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#define GGML_SCHED_MAX_SPLIT_INPUTS 4
 #endif

 #ifndef GGML_SCHED_MAX_COPIES
@@ -1422,31 +1426,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

             GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now

-            // check if a weight is on a different backend and start a new split if so
-            // by starting a new split, the memory of the previously offloaded weights can be reused
-            bool offload = false;
+            // check if we should start a new split based on the sources of the current node
+            bool need_new_split = false;
             if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
                 for (int j = 0; j < GGML_MAX_SRC; j++) {
                     struct ggml_tensor * src = node->src[j];
                     if (src == NULL) {
                         continue;
                     }
+                    // check if a weight is on a different backend
+                    // by starting a new split, the memory of the previously offloaded weights can be reused
                     if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
                         int src_backend_id = tensor_backend_id(src);
                         if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
-                            offload = true;
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                    // check if the split has too many inputs
+                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+                        const size_t id = hash_id(src);
+                        int src_backend_id = sched->tensor_backend_id[id];
+                        if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                            //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
+                            need_new_split = true;
                             break;
                         }
                     }
                 }
             }

-            if (node_backend_id != cur_backend_id || offload) {
+            if (node_backend_id != cur_backend_id || need_new_split) {
                 split->i_end = i;
                 i_split++;
                 if (i_split >= sched->splits_capacity) {
                     sched->splits_capacity *= 2;
                     sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                    GGML_ASSERT(sched->splits != NULL);
                 }
                 GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
                 split = &sched->splits[i_split];
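For readers skimming the hunk above, here is a compact standalone sketch of the split-start decision. It is not the scheduler code itself: the struct and parameter names are made up for illustration, and the real check additionally consults sched->tensor_copies so that a source which already has a copy on the current backend does not count against the input limit.

#include <stdbool.h>

#define MAX_SPLIT_INPUTS 4   // mirrors GGML_SCHED_MAX_SPLIT_INPUTS after this commit

// minimal stand-in for the per-split state tracked by the scheduler
struct split_state {
    int backend_id;   // backend the split is assigned to
    int n_inputs;     // distinct inputs accumulated so far
};

// returns true when the scheduler should close the current split and open a new one:
// the node moved to another backend, one of its weights lives on another backend
// (so the previously offloaded weight memory can be reused), or the split already
// holds the maximum number of inputs and this source would add another one
bool should_start_new_split(const struct split_state * split,
                            int  node_backend_id,
                            bool weight_on_other_backend,
                            bool src_would_add_new_input) {
    if (node_backend_id != split->backend_id) {
        return true;
    }
    if (split->n_inputs > 0 && weight_on_other_backend) {
        return true;
    }
    if (split->n_inputs == MAX_SPLIT_INPUTS && src_would_add_new_input) {
        return true;
    }
    return false;
}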
@@ -1523,13 +1539,15 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

     // create copies of the graph for each split
     // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
+            assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
             struct ggml_tensor * input = split->inputs[j];
             const size_t input_id = hash_id(input);
             struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
@@ -1546,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }

         for (int j = split->i_start; j < split->i_end; j++) {
+            assert(graph_copy->size > graph_copy->n_nodes);
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
@@ -1630,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 }
                 ggml_backend_tensor_copy(input, input_cpy);
             } else {
+                // wait for the split backend to finish using the input before overwriting it
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                 } else {
                     ggml_backend_synchronize(split_backend);
-                    ggml_backend_synchronize(input_backend);
                 }
-
                 ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
             }
         }
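The else branch above can be read in isolation: before an asynchronous copy overwrites the destination tensor, the scheduler either waits on the backend event recorded for that copy slot or, when the backend exposes no events, falls back to a full synchronization. A hedged sketch of that pattern, using the ggml-backend calls shown in the hunk but with hypothetical parameter names and no scheduler bookkeeping:

#include "ggml-backend.h"

// sketch only: the real scheduler passes sched->events[...] and its tensor copies here
void copy_split_input(ggml_backend_t input_backend,
                      ggml_backend_t split_backend,
                      ggml_backend_event_t copy_slot_event,   // may be NULL
                      struct ggml_tensor * input,
                      struct ggml_tensor * input_cpy) {
    if (copy_slot_event != NULL) {
        // wait for the split backend to finish using the input before overwriting it
        ggml_backend_event_wait(split_backend, copy_slot_event);
    } else {
        // no event support: block until the split backend is idle
        ggml_backend_synchronize(split_backend);
    }
    ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
}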
@@ -1709,8 +1727,10 @@ ggml_backend_sched_t ggml_backend_sched_new(
     sched->hash_set = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
-    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
-    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
+
+    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
+    sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);

     sched->n_backends = n_backends;
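As a quick sanity check on the new preallocation bounds (nodes_size above and the graph_copy size in the earlier hunk), the arithmetic below plugs in the constants from this commit; reading the *2 factor as one slot for an input plus one for its backend copy is an assumption, not something stated in the source.

#include <stdio.h>

// constants as defined after this commit
#define SCHED_MAX_SPLITS       2048
#define SCHED_MAX_SPLIT_INPUTS 4

int main(void) {
    // extra graph-copy nodes reserved per split for its inputs:
    // the old code reserved 16 per split, the new code reserves 4*2 = 8
    printf("per-split extra nodes, old: %d\n", 16);
    printf("per-split extra nodes, new: %d\n", SCHED_MAX_SPLIT_INPUTS * 2);

    // worst-case extra entries preallocated for node_backend_ids / leaf_backend_ids
    printf("nodes_size extra entries:   %d\n",
           SCHED_MAX_SPLITS * SCHED_MAX_SPLIT_INPUTS * 2);   // 16384
    return 0;
}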
@@ -1770,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 }

 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
     ggml_backend_sched_split_graph(sched, measure_graph);

     // TODO: extract this to a separate function
ggml-cuda.cu (33)
@@ -7791,11 +7791,11 @@ struct cuda_pool_alloc {

 static bool g_cublas_loaded = false;

-GGML_CALL bool ggml_cublas_loaded(void) {
+static bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }

-GGML_CALL void ggml_init_cublas() {
+static void ggml_init_cublas() {
     static bool initialized = false;

     if (!initialized) {
@@ -7884,7 +7884,7 @@ GGML_CALL void ggml_init_cublas() {
     }
 }

-GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
+static void * ggml_cuda_host_malloc(size_t size) {
     if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
         return nullptr;
     }
@@ -7902,7 +7902,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
     return ptr;
 }

-GGML_CALL void ggml_cuda_host_free(void * ptr) {
+static void ggml_cuda_host_free(void * ptr) {
     CUDA_CHECK(cudaFreeHost(ptr));
 }

@@ -9569,21 +9569,6 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }

-GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-            src1->type == GGML_TYPE_F32 &&
-            dst->type == GGML_TYPE_F32 &&
-            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
-}
-
 static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
     GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
@@ -10336,7 +10321,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
     return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }

-GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
+static void ggml_cuda_set_main_device(const int main_device) {
     if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                 main_device, g_device_count, g_main_device);
@@ -10351,7 +10336,7 @@ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
     }
 }

-GGML_CALL bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) {
+static bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) {
     if (!g_cublas_loaded) return false;

     if (tensor->op == GGML_OP_MUL_MAT) {
@@ -10505,7 +10490,7 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) {
     return true;
 }

-GGML_CALL int ggml_cuda_get_device_count() {
+static int ggml_cuda_get_device_count() {
     int device_count;
     if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
         return 0;
@@ -10513,7 +10498,7 @@ GGML_CALL int ggml_cuda_get_device_count() {
     return device_count;
 }

-GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+static void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
@@ -11397,6 +11382,8 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const
     const int min_batch_size = 32;

     return op->ne[1] > min_batch_size && op->op != GGML_OP_GET_ROWS;
+
+    UNUSED(backend);
 }

 static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
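With the raw ggml_cuda_* helpers now internal to ggml-cuda.cu, external code queries devices through the backend API instead, as the llama-bench hunk at the top of this commit does. A minimal sketch, assuming a CUDA-enabled build and that the ggml_backend_cuda_* declarations are available from ggml-cuda.h:

#include <stdio.h>
#include "ggml-cuda.h"

int main(void) {
    // enumerate CUDA devices through the public backend API
    int count = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < count; i++) {
        char desc[128];
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        printf("CUDA device %d: %s\n", i, desc);
    }
    return 0;
}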
ggml-cuda.h
@@ -17,18 +17,17 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES 16

-// TODO: remove this
-GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
-
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);

 GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

+// device buffer
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

 // split tensor buffer that splits matrices by rows across multiple devices
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
llama.cpp (31)
@@ -8612,16 +8612,18 @@ static struct ggml_cgraph * llama_build_graph(
         }

         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
-        // to fix this, we assign the norm layer manually to the backend of its layer
-        // FIXME: interferes with auto offloading of large batches
-        //if (il != -1 && strcmp(name, "norm") == 0) {
-        //    for (auto * backend : lctx.backends) {
-        //        if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
-        //            ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
-        //            break;
-        //        }
-        //    }
-        //}
+        // FIXME: fix in ggml_backend_sched
+        const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
+        if (batch.n_tokens <= 32 || full_offload) {
+            if (il != -1 && strcmp(name, "norm") == 0) {
+                for (auto * backend : lctx.backends) {
+                    if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
+                        ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
+                        break;
+                    }
+                }
+            }
+        }
     };

     struct ggml_cgraph * result = NULL;
@@ -13119,9 +13121,8 @@ struct llama_context * llama_new_context_with_model(
             ctx->backends.push_back(ctx->backend_metal);
         }
 #elif defined(GGML_USE_CUBLAS)
-    if (model->n_gpu_layers >= 0) { // TODO: make auto-offload configurable
-        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
         if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
             ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -13141,7 +13142,6 @@ struct llama_context * llama_new_context_with_model(
                 ctx->backends.push_back(backend);
             }
         }
-    }
 #elif defined(GGML_USE_VULKAN)
     if (model->n_gpu_layers > 0) {
         for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
@@ -13297,14 +13297,17 @@ struct llama_context * llama_new_context_with_model(
             ggml_backend_t backend = ctx->backends[i];
             ggml_backend_buffer_type_t buft = backend_buft[i];
             size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
+            if (size > 1) {
                 LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
                         ggml_backend_buft_name(buft),
                         size / 1024.0 / 1024.0);
+            }
         }

         // note: the number of splits during measure is higher than during inference due to the kv shift
         int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-        LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
+        LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
+        LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
     }
 }