From e30ad7143f749441621721a9309ecc6955297f09 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Sat, 16 Sep 2023 01:20:53 +0800
Subject: [PATCH] Apply suggestions from code review

Co-authored-by: Georgi Gerganov
---
 llama.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 4e081123f..0f277b14f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3597,16 +3597,8 @@ static struct ggml_cgraph * llm_build_starcoder(
                         ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
             ggml_set_name(V, "V");

-#if 1
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
             ggml_set_name(KQV, "KQV");
-#else
-            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
-            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
-            // is there a better way?
-            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
-#endif

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
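
Note for reviewers: the two paths toggled by the deleted #if 1 block can be summarized
as a small helper. This is a minimal sketch, not code from the patch: build_kqv is a
hypothetical name, and ggml_cont stands in for the ggml_cpy-into-a-fresh-tensor idiom
used in the deleted lines. Per the deleted comment, the contiguous variant was faster
for perplexity computation on M1 but ~5% slower for single-token generation, which is
why the direct-view path is the one being kept.

    #include "ggml.h"

    // Sketch: the two KQV strategies from the deleted #if/#else block.
    // V is a (typically non-contiguous) view into the KV cache;
    // KQ_soft_max is the softmax-ed attention score tensor.
    static struct ggml_tensor * build_kqv(
            struct ggml_context * ctx0,
            struct ggml_tensor  * V,
            struct ggml_tensor  * KQ_soft_max,
            bool                  make_v_contiguous) {
        if (make_v_contiguous) {
            // removed path: copy V into contiguous memory first,
            // trading copy time for a faster matmul
            struct ggml_tensor * V_cont = ggml_cont(ctx0, V);
            return ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
        }
        // kept path: multiply against the strided view directly
        return ggml_mul_mat(ctx0, V, KQ_soft_max);
    }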