sync : ggml-cuda

ggml-ci
This commit is contained in:
Georgi Gerganov 2023-11-02 18:40:33 +02:00
parent 4fe646ffbe
commit 83c96d5809
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 14 additions and 5 deletions

View file

@ -81,6 +81,7 @@
#include "ggml-cuda.h" #include "ggml-cuda.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-backend-impl.h"
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#define CC_VOLTA 700 #define CC_VOLTA 700
@ -7649,11 +7650,11 @@ static size_t g_temp_tensor_extra_index = 0;
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
if (g_temp_tensor_extras == nullptr) { if (g_temp_tensor_extras == nullptr) {
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
} }
size_t alloc_index = g_temp_tensor_extra_index; size_t alloc_index = g_temp_tensor_extra_index;
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES; g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
memset(extra, 0, sizeof(*extra)); memset(extra, 0, sizeof(*extra));
@ -7960,11 +7961,11 @@ struct ggml_backend_buffer_context_cuda {
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
if (temp_tensor_extras == nullptr) { if (temp_tensor_extras == nullptr) {
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
} }
size_t alloc_index = temp_tensor_extra_index; size_t alloc_index = temp_tensor_extra_index;
temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES; temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
memset(extra, 0, sizeof(*extra)); memset(extra, 0, sizeof(*extra));
@ -8050,7 +8051,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
ggml_cuda_set_device(g_main_device); ggml_cuda_set_device(g_main_device);
ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda; ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
ggml_cuda_set_device(g_main_device);
CUDA_CHECK(cudaMalloc(&ctx->device, size)); CUDA_CHECK(cudaMalloc(&ctx->device, size));
return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size); return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
} }
@ -8117,6 +8123,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
for (int i = 0; i < cgraph->n_nodes; i++) { for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i]; ggml_tensor * node = cgraph->nodes[i];
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
continue;
assert(node->backend == GGML_BACKEND_GPU); assert(node->backend == GGML_BACKEND_GPU);
for (int j = 0; j < GGML_MAX_SRC; j++) { for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) { if (node->src[j] != nullptr) {

3
ggml.c
View file

@ -100,7 +100,8 @@ typedef void * thread_ret_t;
#include <hbwmalloc.h> #include <hbwmalloc.h>
#endif #endif
#if defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) #if (defined(__linux__) || defined(__APPLE__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__)) && \
(!defined(TARGET_OS_TV))
#include <sys/wait.h> #include <sys/wait.h>