sync : ggml (backend v2) (#3912)

* sync : ggml (backend v2) (wip)

* sync : migrate examples and llama.cpp to dynamic graphs (wip)

* sync : update tests + fix max op params to 64

ggml-ci

* sync : ggml-cuda

ggml-ci

* llama : fix save/load state context size

ggml-ci

* sync : try to fix build on tvOS

* sync : pass custom graph sizes in training examples

* sync : update graph copies to new ggml API

* sync : update sync-ggml.sh with new files

* scripts : fix header in sync script

* train : fix context size calculations

* llama : increase inference graph size up to 4096 nodes

* train : allocate grads for backward graphs

* train : allocate grads for gb_tmp
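
Note: most of the bullets above follow from the switch away from fixed-size graphs (GGML_MAX_NODES) to graphs whose node capacity is chosen at creation time. Below is a minimal sketch of the new allocation pattern, assuming the ggml_new_graph_custom / ggml_graph_overhead_custom entry points that dynamic graphs provide (check ggml.h for the exact signatures); the 4096 figure mirrors the inference graph size mentioned above, and the context-size arithmetic is illustrative rather than the exact expression used in llama.cpp.

#include "ggml.h"

int main(void) {
    const size_t graph_size = 4096; // caller-chosen capacity (old code was capped at GGML_MAX_NODES)

    // reserve only metadata: graph bookkeeping plus per-node tensor overhead;
    // tensor data is assumed to be allocated elsewhere (no_alloc = true)
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*graph_size + ggml_graph_overhead_custom(graph_size, false),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };

    struct ggml_context * ctx = ggml_init(params);

    // graphs now carry an explicit capacity; passing grads = true would also
    // reserve gradient slots, as the "allocate grads for backward graphs"
    // bullets above require for training
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_size, false);

    // ... build tensors in ctx and register them with ggml_build_forward_expand(gf, result) ...

    ggml_free(ctx);
    return 0;
}
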
Author: Georgi Gerganov, 2023-11-13 14:16:23 +02:00 (committed by GitHub)
parent bb50a792ec
commit 4760e7cc0b
22 changed files with 1994 additions and 864 deletions

ggml-cuda.cu

@@ -81,6 +81,7 @@
 #include "ggml-cuda.h"
 #include "ggml.h"
+#include "ggml-backend-impl.h"
 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA 700
@@ -7751,11 +7752,11 @@ static size_t g_temp_tensor_extra_index = 0;
 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
     }
     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
@@ -8070,11 +8071,11 @@ struct ggml_backend_buffer_context_cuda {
     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
         }
         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));
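
For context, both hunks above touch the same pattern: a fixed pool of temporary tensor-extra slots handed out round-robin, one slot per graph node, so its length now tracks the default graph capacity rather than the old global GGML_MAX_NODES limit. A standalone sketch of that ring-buffer idea follows (the names and the 2048 placeholder are illustrative, not the actual ggml-cuda symbols).

#include <cstring>
#include <cstddef>

// stand-in for ggml_tensor_extra_gpu, which in ggml-cuda holds per-device
// data pointers and CUDA events for a single tensor
struct tensor_extra {
    void * data_device[16];
};

static tensor_extra * g_pool       = nullptr;
static size_t         g_pool_size  = 2048; // placeholder for GGML_DEFAULT_GRAPH_SIZE
static size_t         g_pool_index = 0;

// hand out slots round-robin; they are recycled across graph evaluations,
// which is why one slot per possible node of a single graph is enough
static tensor_extra * alloc_temp_extra() {
    if (g_pool == nullptr) {
        g_pool = new tensor_extra[g_pool_size];
    }
    tensor_extra * extra = &g_pool[g_pool_index];
    g_pool_index = (g_pool_index + 1) % g_pool_size;
    std::memset(extra, 0, sizeof(*extra));
    return extra;
}
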
@@ -8160,7 +8161,12 @@ static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backe
     ggml_cuda_set_device(g_main_device);
     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+    ggml_cuda_set_device(g_main_device);
     CUDA_CHECK(cudaMalloc(&ctx->device, size));
     return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
 }
@@ -8227,6 +8233,8 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+            continue;
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
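
The last hunk makes the compute loop skip nodes whose ops are pure views (RESHAPE, TRANSPOSE, VIEW, PERMUTE): they only reinterpret an existing tensor, so there is no kernel to launch and no reason to require their backend field to be GPU. The same check, factored into a helper for readability, is sketched below (the helper name is illustrative, not an existing ggml function).

#include "ggml.h"

// illustrative helper: ops that only create a different view of an existing
// tensor and therefore need no computation of their own
static bool is_view_op(enum ggml_op op) {
    return op == GGML_OP_RESHAPE   ||
           op == GGML_OP_TRANSPOSE ||
           op == GGML_OP_VIEW      ||
           op == GGML_OP_PERMUTE;
}

// usage inside a graph-compute loop (sketch):
//
//   for (int i = 0; i < cgraph->n_nodes; i++) {
//       struct ggml_tensor * node = cgraph->nodes[i];
//       if (is_view_op(node->op)) {
//           continue; // nothing to compute for a view
//       }
//       // ... evaluate node ...
//   }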