From e30ad7143f749441621721a9309ecc6955297f09 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Sat, 16 Sep 2023 01:20:53 +0800
Subject: [PATCH] Apply suggestions from code review

Co-authored-by: Georgi Gerganov
---
 llama.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 4e081123f..0f277b14f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3597,16 +3597,8 @@ static struct ggml_cgraph * llm_build_starcoder(
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
             ggml_set_name(V, "V");

-#if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             ggml_set_name(KQV, "KQV");
-#else
-            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
-            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
-            // is there a better way?
-            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
-#endif

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
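
Note for reviewers: the two paths toggled by the deleted #if 1 block can be summarized
as a small helper. This is a minimal sketch, not code from the patch: build_kqv is a
hypothetical name, and ggml_cont stands in for the ggml_cpy-into-a-fresh-tensor idiom
used in the deleted lines. Per the deleted comment, the contiguous variant was faster
for perplexity computation on M1 but ~5% slower for single-token generation, which is
why the direct-view path is the one being kept.

    #include "ggml.h"

    // Sketch: the two KQV strategies from the deleted #if/#else block.
    // V is a (typically non-contiguous) view into the KV cache;
    // KQ_soft_max is the softmax-ed attention score tensor.
    static struct ggml_tensor * build_kqv(
            struct ggml_context * ctx0,
            struct ggml_tensor  * V,
            struct ggml_tensor  * KQ_soft_max,
            bool                  make_v_contiguous) {
        if (make_v_contiguous) {
            // removed path: copy V into contiguous memory first,
            // trading copy time for a faster matmul
            struct ggml_tensor * V_cont = ggml_cont(ctx0, V);
            return ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
        }
        // kept path: multiply against the strided view directly
        return ggml_mul_mat(ctx0, V, KQ_soft_max);
    }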