diff --git a/ggml-backend.c b/ggml-backend.c index 535426b9a..9b5f1ffba 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1006,7 +1006,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 2: assign backends to ops from current assignments // start from the end and assign the same backend to previous ops - // expand gpu backends (i.e. non last prio) up and down, ignoring cpu + // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend) // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops // pass 2.1 expand gpu up @@ -1020,7 +1020,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g ggml_tallocr_t node_allocr = node_allocr(node); if (node_allocr != NULL) { if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { - // skip cpu + // skip cpu (lowest prio backend) cur_allocr = NULL; } else { cur_allocr = node_allocr; @@ -1043,7 +1043,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g ggml_tallocr_t node_allocr = node_allocr(node); if (node_allocr != NULL) { if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { - // skip cpu + // skip cpu (lowest prio backend) cur_allocr = NULL; } else { cur_allocr = node_allocr; @@ -1107,9 +1107,10 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 4: split graph, find tensors that need to be copied { int cur_split = 0; + // find the backend of the first split, skipping view ops for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (node->view_src == NULL) { + if (!ggml_is_view_op(node->op)) { sched->splits[0].tallocr = node_allocr(node); break; } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f83d60fa3..b0435b81f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9856,30 +9856,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des #define UNUSED GGML_UNUSED -struct ggml_backend_context_cuda { +struct ggml_backend_cuda_context { int device; std::string name; }; // cuda buffer -struct ggml_backend_buffer_context_cuda { +struct ggml_backend_cuda_buffer_context { int device; void * dev_ptr = nullptr; ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; size_t temp_tensor_extra_index = 0; std::string name; - ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : + ggml_backend_cuda_buffer_context(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr), name(GGML_CUDA_NAME + std::to_string(device)) { } - ~ggml_backend_buffer_context_cuda() { + ~ggml_backend_cuda_buffer_context() { delete[] temp_tensor_extras; } ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset if (temp_tensor_extras == nullptr) { temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; } @@ -9894,23 +9895,23 @@ struct ggml_backend_buffer_context_cuda { }; static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->name.c_str(); } static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; CUDA_CHECK(cudaFree(ctx->dev_ptr)); delete ctx; } static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->dev_ptr; } static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { assert(tensor->view_src->buffer->buft == buffer->buft); @@ -9944,7 +9945,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); @@ -9955,7 +9956,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); @@ -9963,7 +9964,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co } static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); @@ -10011,7 +10012,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac return nullptr; } - ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(buft_ctx->device, dev_ptr); + ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr); return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } @@ -10048,7 +10049,7 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t } ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return buft_ctx->device == cuda_ctx->device; } @@ -10420,26 +10421,26 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { // backend static const char * ggml_backend_cuda_name(ggml_backend_t backend) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return cuda_ctx->name.c_str(); } static void ggml_backend_cuda_free(ggml_backend_t backend) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; delete cuda_ctx; delete backend; } static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return ggml_backend_cuda_buffer_type(cuda_ctx->device); } static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -10448,7 +10449,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens } static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -10457,7 +10458,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm } static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0])); @@ -10465,7 +10466,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { } static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_main_device(cuda_ctx->device); @@ -10642,7 +10643,7 @@ ggml_backend_t ggml_backend_cuda_init(int device) { // not strictly necessary, but it may reduce the overhead of the first graph_compute ggml_cuda_set_main_device(device); - ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda { + ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context { /* .device = */ device, /* .name = */ GGML_CUDA_NAME + std::to_string(device), }; diff --git a/llama.cpp b/llama.cpp index b2e2ca298..7a573d77b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9267,6 +9267,8 @@ struct llama_context * llama_new_context_with_model( ctx->backend_metal = ggml_backend_metal_init(); if (ctx->backend_metal == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__); + llama_free(ctx); + return nullptr; } ctx->backends.push_back(ctx->backend_metal); } @@ -9298,7 +9300,7 @@ struct llama_context * llama_new_context_with_model( ctx->backend_cpu = ggml_backend_cpu_init(); if (ctx->backend_cpu == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); - delete ctx; + llama_free(ctx); return nullptr; } ctx->backends.push_back(ctx->backend_cpu);