From 0c268a83e866e49685c75f8274338e84d90cc084 Mon Sep 17 00:00:00 2001 From: lshzh-ww Date: Thu, 24 Aug 2023 01:34:57 -0400 Subject: [PATCH] ggml-alloc: avoid returning silently In certain cases, the allocate_node() function may silently return without performing any memory allocation. --- ggml-alloc.c | 4 ++-- llama.cpp | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 7f1328566..af4affa4e 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -441,8 +441,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); node->data = parent->data; + return; } - return; } } } @@ -528,7 +528,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n( } AT_PRINTF("\n"); } - + // update parents // update immediately if there is no parse_seq diff --git a/llama.cpp b/llama.cpp index f2dc4da1d..c1407b70b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2704,11 +2704,6 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * inpFF = attn_norm; cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - - // TODO: this is temporary needed to introduce artificial dependency between FF and ATTN - // adding this, because there seems to be a bug in the Metal concurrency optimization - // without this line, the results are non-deterministic and wrong - cur->src[2] = attn_out; offload_func(cur); cur = ggml_gelu(ctx0, cur);