From ea708819416a3ba65f76d42a107e3af1a9689fb9 Mon Sep 17 00:00:00 2001
From: Jan Ploski
Date: Sat, 17 Jun 2023 04:48:40 +0200
Subject: [PATCH] Made option --memory-f32 enabled by default since
 ggml_repeat2 currently only has F32 implementation. Improved memory
 allocation for ctx and kv memory to be accurate. Moved model.memory_k,
 model.memory_v to kv_self.k, kv_self.v and the initialization into
 kv_cache_init (to be more like llama.cpp).

---
 examples/falcon_common.cpp |  2 +-
 libfalcon.cpp              | 83 ++++++++++++++------------------
 2 files changed, 32 insertions(+), 53 deletions(-)

diff --git a/examples/falcon_common.cpp b/examples/falcon_common.cpp
index 9aa6d2942..8acde68b6 100644
--- a/examples/falcon_common.cpp
+++ b/examples/falcon_common.cpp
@@ -529,7 +529,7 @@ struct falcon_context * falcon_init_from_gpt_params(const gpt_params & params) {
     lparams.main_gpu = params.main_gpu;
     memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed = params.seed;
-    lparams.f16_kv = params.memory_f16;
+    lparams.f16_kv = false; //params.memory_f16; // TODO? unsupported because ggml_repeat2 currently only implemented for f32
     lparams.use_mmap = params.use_mmap;
     lparams.use_mlock = params.use_mlock;
     lparams.logits_all = params.perplexity;
diff --git a/libfalcon.cpp b/libfalcon.cpp
index 96d7c6f5f..cdb38eba0 100644
--- a/libfalcon.cpp
+++ b/libfalcon.cpp
@@ -81,16 +81,6 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
     return k_sizes;
 }
 
-// 2*n_embd*n_ctx*n_layer*sizeof(float16)
-static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
-{
-    static std::map<e_model, size_t> k_sizes = {
-        { FALCON_7B,  1026ull * MB },
-        { FALCON_40B, 5120ull * MB },
-    };
-    return k_sizes;
-}
-
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
@@ -118,6 +108,18 @@ struct falcon_hparams {
     }
 };
 
+static size_t MEM_REQ_KV_SELF(
+    const falcon_hparams & hparams, ggml_type wtype, int32_t n_ctx)
+{
+    const int n_head_kv = hparams.n_head_kv;
+    const int head_dim  = hparams.n_embd / hparams.n_head;
+    const int n_layer   = hparams.n_layer;
+
+    const int64_t ne = n_head_kv * head_dim * n_layer * n_ctx;
+
+    return 2u * (ggml_tensor_overhead() + ne * ggml_type_size(wtype));
+}
+
 struct falcon_layer {
     // normalization
     struct ggml_tensor* input_layernorm;
@@ -164,9 +166,6 @@ struct falcon_model {
 
     std::vector<falcon_layer> layers;
 
-    // key + value memory
-    struct ggml_tensor* memory_k;
-    struct ggml_tensor* memory_v;
     int n_gpu_layers;
 
     // context
@@ -687,8 +686,7 @@ struct llama_model_loader {
     void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
         *ctx_size_p = *mmapped_size_p = 0;
        for (const falcon_load_tensor & lt : tensors_map.tensors) {
-            *ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            *ctx_size_p += 64 * MB;
+            *ctx_size_p += ggml_tensor_overhead();
             *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
         }
     }
@@ -871,14 +869,12 @@ static bool kv_cache_init(
         struct falcon_kv_cache & cache,
         ggml_type wtype,
         int n_ctx) {
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
 
-    const int64_t n_mem      = n_layer*n_ctx;
-    const int64_t n_elements = n_embd*n_mem;
+    const int64_t head_dim = hparams.n_embd / hparams.n_head;
+    const int64_t n_elements =
+        hparams.n_layer * n_ctx * head_dim * hparams.n_head_kv;
 
-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
+    cache.buf.resize(MEM_REQ_KV_SELF(hparams, wtype, n_ctx));
 
     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
@@ -908,7 +904,7 @@ struct falcon_context_params falcon_context_default_params() {
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ {0},
         /*.seed =*/ -1,
-        /*.f16_kv =*/ true,
+        /*.f16_kv =*/ false,
         /*.logits_all =*/ false,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
@@ -1220,41 +1216,24 @@ static void falcon_model_load_internal(
         }
     }
 
-    // key + value memory
-    {
-        const int n_layer = hparams.n_layer;
-        const int n_ctx = hparams.n_ctx;
-        const int n_head_kv = hparams.n_head_kv;
-        const int head_dim = hparams.n_embd / hparams.n_head;
-
-        const int64_t n_mem = n_layer*n_ctx;
-        const int64_t n_elements = head_dim*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_head_kv * n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_head_kv * n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: (a) memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
     ml->done_getting_tensors();
 
     // print memory requirements
     {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
 
-        // this is the total memory required to run the inference
-        const size_t mem_required =
+        // TODO: this calculation is still wrong
+        int64_t mem_required =
             ctx_size +
             mmapped_size - vram_weights + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0().at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
             MEM_REQ_EVAL().at (model.type);
 
+        if (mem_required < 0) mem_required = 0;
+
         // this is the memory required by one llama_state
         const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
+            MEM_REQ_KV_SELF(model.hparams, memory_type, n_ctx);
 
         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1482,13 +1461,13 @@ static bool falcon_eval_internal(
         // store key and value to memory
         {
             struct ggml_tensor* k = ggml_view_1d(
-                ctx0, model.memory_k, N * n_head_kv * head_dim,
-                (ggml_element_size(model.memory_k) * n_head_kv * head_dim) *
+                ctx0, kv_self.k, N * n_head_kv * head_dim,
+                (ggml_element_size(kv_self.k) * n_head_kv * head_dim) *
                     (il * n_ctx + n_past));
             ggml_set_name(k, "k");
 
             struct ggml_tensor* v = ggml_view_1d(
-                ctx0, model.memory_v, N * n_head_kv * head_dim,
-                (ggml_element_size(model.memory_v) * n_head_kv * head_dim) *
+                ctx0, kv_self.v, N * n_head_kv * head_dim,
+                (ggml_element_size(kv_self.v) * n_head_kv * head_dim) *
                     (il * n_ctx + n_past));
             ggml_set_name(v, "v");
@@ -1500,9 +1479,9 @@ static bool falcon_eval_internal(
                 ctx0,
                 ggml_reshape_3d(
                     ctx0,
-                    ggml_view_1d(ctx0, model.memory_k, (n_past + N) * n_head_kv * head_dim,
+                    ggml_view_1d(ctx0, kv_self.k, (n_past + N) * n_head_kv * head_dim,
                                  il * n_ctx *
-                                     ggml_element_size(model.memory_k) *
+                                     ggml_element_size(kv_self.k) *
                                      n_head_kv * head_dim),
                     head_dim, n_head_kv, n_past + N),
@@ -1539,9 +1518,9 @@ static bool falcon_eval_internal(
                 ctx0,
                 ggml_reshape_3d(
                     ctx0,
-                    ggml_view_1d(ctx0, model.memory_v, (n_past + N) * n_head_kv * head_dim,
+                    ggml_view_1d(ctx0, kv_self.v, (n_past + N) * n_head_kv * head_dim,
                                  il * n_ctx *
-                                     ggml_element_size(model.memory_v) *
+                                     ggml_element_size(kv_self.v) *
                                      n_head_kv * head_dim),
                     head_dim, n_head_kv, n_past + N),