From 83c96d580965209c8d88f203524af69de32642e2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 2 Nov 2023 18:40:33 +0200 Subject: [PATCH] sync : ggml-cuda ggml-ci --- ggml-cuda.cu | 16 ++++++++++++---- ggml.c | 3 ++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e46295126..e470e5bd6 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -81,6 +81,7 @@ #include "ggml-cuda.h" #include "ggml.h" +#include "ggml-backend-impl.h" #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define CC_VOLTA 700 @@ -7649,11 +7650,11 @@ static size_t g_temp_tensor_extra_index = 0; static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { if (g_temp_tensor_extras == nullptr) { - g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE]; } size_t alloc_index = g_temp_tensor_extra_index; - g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES; + g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE; ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; memset(extra, 0, sizeof(*extra)); @@ -7960,11 +7961,11 @@ struct ggml_backend_buffer_context_cuda { ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { if (temp_tensor_extras == nullptr) { - temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE]; } size_t alloc_index = temp_tensor_extra_index; - temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES; + temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE; ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; memset(extra, 0, sizeof(*extra)); @@ -8050,7 +8051,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe ggml_cuda_set_device(g_main_device); ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda; + + size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0 + + ggml_cuda_set_device(g_main_device); CUDA_CHECK(cudaMalloc(&ctx->device, size)); + return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size); } @@ -8117,6 +8123,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) + continue; assert(node->backend == GGML_BACKEND_GPU); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { diff --git a/ggml.c b/ggml.c index 2445c80e3..46f032cd5 100644 --- a/ggml.c +++ b/ggml.c @@ -100,7 +100,8 @@ typedef void * thread_ret_t; #include #endif -#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) +#if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \ + (!defined(TARGET_OS_TV)) #include