Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-09-16 01:20:53 +08:00 · 2023-09-16 01:20:53 +08:00 · e30ad7143f
commit e30ad7143f
parent eafcc34f0a
1 changed file with 0 additions and 8 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -3597,16 +3597,8 @@ static struct ggml_cgraph * llm_build_starcoder(
                        ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
            ggml_set_name(V, "V");
 #if 1
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
            ggml_set_name(KQV, "KQV");
 #else
            // make V contiguous in memory to speed up the matmul, however we waste time on the copy
            // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
            // is there a better way?
            struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
 #endif
            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);