From 0ec27ad66c56a4831f62a8106b7873a6818bf051 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 22 Aug 2023 23:11:41 +0300 Subject: [PATCH] falcon : minor --- ggml-alloc.c | 4 ++-- ggml-alloc.h | 2 +- llama.cpp | 26 ++++++++++++-------------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index f06f9a3c1..547ec0399 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -238,7 +238,7 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t alloc->n_free_blocks++; } -void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) { +void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) { int pos = 0; for (int i = 0; i < n; i++) { if (list[i] != -1) { @@ -547,7 +547,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n( struct ggml_tensor * view_src = get_view_source(parent); struct hash_node * view_src_hn = hash_get(ht, view_src); view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views); + AT_PRINTF("view_src %s\n", view_src->name); if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { ggml_allocator_free_tensor(alloc, view_src); } diff --git a/ggml-alloc.h b/ggml-alloc.h index 14a4350ac..9559da758 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); // tell the allocator to parse nodes following the order described in the list // you should call this if your graph are optimized to execute out-of-order -GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n); +GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n); GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); diff --git a/llama.cpp b/llama.cpp index d1a6e1be2..b65bf9461 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2436,7 +2436,7 @@ static struct ggml_cgraph * llm_build_falcon( attn_norm), ggml_repeat(ctx0, model.layers[il].attn_norm_b, attn_norm)); - if (hparams.n_head_kv == 8) { // Falcon-40B + if (model.layers[il].attn_norm_2) { // Falcon-40B cur = ggml_norm(ctx0, inpL); cur = ggml_add(ctx0, @@ -2461,23 +2461,25 @@ static struct ggml_cgraph * llm_build_falcon( // trickery when trying to accurately dump these views for // debugging. + const size_t wsize = ggml_type_size(cur->type); + struct ggml_tensor * Qcur = ggml_view_3d( ctx0, cur, n_embd_head, n_head, N, - n_embd_head * ggml_type_size(GGML_TYPE_F32), - n_embd_head * (n_head + 2 * n_head_kv) * ggml_type_size(GGML_TYPE_F32), + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), 0); struct ggml_tensor * Kcur = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, N, - n_embd_head * ggml_type_size(GGML_TYPE_F32), - n_embd_head * (n_head + 2 * n_head_kv) * ggml_type_size(GGML_TYPE_F32), - n_embd_head * n_head * ggml_type_size(GGML_TYPE_F32)); + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * n_head); struct ggml_tensor * Vcur = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, N, - n_embd_head * ggml_type_size(GGML_TYPE_F32), - n_embd_head * (n_head + 2 * n_head_kv) * ggml_type_size(GGML_TYPE_F32), - n_embd_head * (n_head + n_head_kv) * ggml_type_size(GGML_TYPE_F32)); + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * (n_head + n_head_kv)); // using mode = 2 for neox mode Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_embd_head, 2, 0); @@ -2518,11 +2520,7 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd_head))) - ); + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);