address review comments
This commit is contained in:
parent
c522c112b3
commit
9d4ba6ed07
3 changed files with 30 additions and 26 deletions
|
@ -1006,7 +1006,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
// pass 2: assign backends to ops from current assignments
|
// pass 2: assign backends to ops from current assignments
|
||||||
// start from the end and assign the same backend to previous ops
|
// start from the end and assign the same backend to previous ops
|
||||||
|
|
||||||
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu
|
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
|
||||||
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
||||||
|
|
||||||
// pass 2.1 expand gpu up
|
// pass 2.1 expand gpu up
|
||||||
|
@ -1020,7 +1020,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
ggml_tallocr_t node_allocr = node_allocr(node);
|
||||||
if (node_allocr != NULL) {
|
if (node_allocr != NULL) {
|
||||||
if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
|
if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
|
||||||
// skip cpu
|
// skip cpu (lowest prio backend)
|
||||||
cur_allocr = NULL;
|
cur_allocr = NULL;
|
||||||
} else {
|
} else {
|
||||||
cur_allocr = node_allocr;
|
cur_allocr = node_allocr;
|
||||||
|
@ -1043,7 +1043,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
ggml_tallocr_t node_allocr = node_allocr(node);
|
ggml_tallocr_t node_allocr = node_allocr(node);
|
||||||
if (node_allocr != NULL) {
|
if (node_allocr != NULL) {
|
||||||
if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
|
if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
|
||||||
// skip cpu
|
// skip cpu (lowest prio backend)
|
||||||
cur_allocr = NULL;
|
cur_allocr = NULL;
|
||||||
} else {
|
} else {
|
||||||
cur_allocr = node_allocr;
|
cur_allocr = node_allocr;
|
||||||
|
@ -1107,9 +1107,10 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
||||||
// pass 4: split graph, find tensors that need to be copied
|
// pass 4: split graph, find tensors that need to be copied
|
||||||
{
|
{
|
||||||
int cur_split = 0;
|
int cur_split = 0;
|
||||||
|
// find the backend of the first split, skipping view ops
|
||||||
for (int i = 0; i < graph->n_nodes; i++) {
|
for (int i = 0; i < graph->n_nodes; i++) {
|
||||||
struct ggml_tensor * node = graph->nodes[i];
|
struct ggml_tensor * node = graph->nodes[i];
|
||||||
if (node->view_src == NULL) {
|
if (!ggml_is_view_op(node->op)) {
|
||||||
sched->splits[0].tallocr = node_allocr(node);
|
sched->splits[0].tallocr = node_allocr(node);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
43
ggml-cuda.cu
43
ggml-cuda.cu
|
@ -9856,30 +9856,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des
|
||||||
|
|
||||||
#define UNUSED GGML_UNUSED
|
#define UNUSED GGML_UNUSED
|
||||||
|
|
||||||
struct ggml_backend_context_cuda {
|
struct ggml_backend_cuda_context {
|
||||||
int device;
|
int device;
|
||||||
std::string name;
|
std::string name;
|
||||||
};
|
};
|
||||||
|
|
||||||
// cuda buffer
|
// cuda buffer
|
||||||
|
|
||||||
struct ggml_backend_buffer_context_cuda {
|
struct ggml_backend_cuda_buffer_context {
|
||||||
int device;
|
int device;
|
||||||
void * dev_ptr = nullptr;
|
void * dev_ptr = nullptr;
|
||||||
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
|
||||||
size_t temp_tensor_extra_index = 0;
|
size_t temp_tensor_extra_index = 0;
|
||||||
std::string name;
|
std::string name;
|
||||||
|
|
||||||
ggml_backend_buffer_context_cuda(int device, void * dev_ptr) :
|
ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
|
||||||
device(device), dev_ptr(dev_ptr),
|
device(device), dev_ptr(dev_ptr),
|
||||||
name(GGML_CUDA_NAME + std::to_string(device)) {
|
name(GGML_CUDA_NAME + std::to_string(device)) {
|
||||||
}
|
}
|
||||||
|
|
||||||
~ggml_backend_buffer_context_cuda() {
|
~ggml_backend_cuda_buffer_context() {
|
||||||
delete[] temp_tensor_extras;
|
delete[] temp_tensor_extras;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
||||||
|
// TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
|
||||||
if (temp_tensor_extras == nullptr) {
|
if (temp_tensor_extras == nullptr) {
|
||||||
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
||||||
}
|
}
|
||||||
|
@ -9894,23 +9895,23 @@ struct ggml_backend_buffer_context_cuda {
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
return ctx->name.c_str();
|
return ctx->name.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
CUDA_CHECK(cudaFree(ctx->dev_ptr));
|
||||||
delete ctx;
|
delete ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
return ctx->dev_ptr;
|
return ctx->dev_ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
|
|
||||||
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
if (tensor->view_src != NULL && tensor->view_offs == 0) {
|
||||||
assert(tensor->view_src->buffer->buft == buffer->buft);
|
assert(tensor->view_src->buffer->buft == buffer->buft);
|
||||||
|
@ -9944,7 +9945,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
|
||||||
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx->device);
|
ggml_cuda_set_device(ctx->device);
|
||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
|
@ -9955,7 +9956,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||||
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx->device);
|
ggml_cuda_set_device(ctx->device);
|
||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
|
@ -9963,7 +9964,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
|
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx->device);
|
ggml_cuda_set_device(ctx->device);
|
||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
|
@ -10011,7 +10012,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(buft_ctx->device, dev_ptr);
|
ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
|
||||||
|
|
||||||
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
|
||||||
}
|
}
|
||||||
|
@ -10048,7 +10049,7 @@ static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_t
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
return buft_ctx->device == cuda_ctx->device;
|
return buft_ctx->device == cuda_ctx->device;
|
||||||
}
|
}
|
||||||
|
@ -10420,26 +10421,26 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
|
||||||
// backend
|
// backend
|
||||||
|
|
||||||
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
return cuda_ctx->name.c_str();
|
return cuda_ctx->name.c_str();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
static void ggml_backend_cuda_free(ggml_backend_t backend) {
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
delete cuda_ctx;
|
delete cuda_ctx;
|
||||||
delete backend;
|
delete backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
@ -10448,7 +10449,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
|
||||||
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
|
||||||
|
@ -10457,7 +10458,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
|
CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
|
||||||
|
|
||||||
|
@ -10465,7 +10466,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||||
ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
ggml_cuda_set_main_device(cuda_ctx->device);
|
ggml_cuda_set_main_device(cuda_ctx->device);
|
||||||
|
|
||||||
|
@ -10642,7 +10643,7 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
|
||||||
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
// not strictly necessary, but it may reduce the overhead of the first graph_compute
|
||||||
ggml_cuda_set_main_device(device);
|
ggml_cuda_set_main_device(device);
|
||||||
|
|
||||||
ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
|
ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
|
||||||
/* .device = */ device,
|
/* .device = */ device,
|
||||||
/* .name = */ GGML_CUDA_NAME + std::to_string(device),
|
/* .name = */ GGML_CUDA_NAME + std::to_string(device),
|
||||||
};
|
};
|
||||||
|
|
|
@ -9267,6 +9267,8 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ctx->backend_metal = ggml_backend_metal_init();
|
ctx->backend_metal = ggml_backend_metal_init();
|
||||||
if (ctx->backend_metal == nullptr) {
|
if (ctx->backend_metal == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
|
||||||
|
llama_free(ctx);
|
||||||
|
return nullptr;
|
||||||
}
|
}
|
||||||
ctx->backends.push_back(ctx->backend_metal);
|
ctx->backends.push_back(ctx->backend_metal);
|
||||||
}
|
}
|
||||||
|
@ -9298,7 +9300,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ctx->backend_cpu = ggml_backend_cpu_init();
|
ctx->backend_cpu = ggml_backend_cpu_init();
|
||||||
if (ctx->backend_cpu == nullptr) {
|
if (ctx->backend_cpu == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
||||||
delete ctx;
|
llama_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
ctx->backends.push_back(ctx->backend_cpu);
|
ctx->backends.push_back(ctx->backend_cpu);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue