From 1657f92d2ff2652d8e3e71badf8690fc15279af9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 16 Feb 2024 10:41:38 +0200
Subject: [PATCH] llama : init kq_pos only if needed

ggml-ci
---
 llama.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 8e872f3d7..7d6c90872 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1561,6 +1561,7 @@ struct llama_hparams {
     float f_max_alibi_bias = 0.0f;
 
     bool causal_attn = true;
+    bool need_kq_pos = false;
 
     uint32_t pooling_type = LLAMA_POOLING_NONE;
 
@@ -3242,6 +3243,10 @@ static void llm_load_hparams(
     }
 
     model.ftype = ml.ftype;
+
+    if (hparams.f_max_alibi_bias > 0.0f) {
+        hparams.need_kq_pos = true;
+    }
 }
 
 // TODO: This should probably be in llama.h
@@ -7529,7 +7534,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    {
+    if (hparams.need_kq_pos) {
         const int64_t n_kv = kv_self.n;
 
         assert(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));