Merge branch 'custom-attention-mask' into cam-cuda-2

Georgi Gerganov 2023-09-20 09:17:48 +03:00
commit 2e92aefef3
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 11 additions and 8 deletions


@@ -326,9 +326,10 @@ int main(int argc, char ** argv) {
             const auto t_main_end = ggml_time_us();

-            LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, cache miss %d \033[0m: \n\nInput: %s\nResponse: %s\n\n",
+            LOG_TEE("\033[1mClient %3d, seq %4d, prompt %4d t, response %4d t, time %5.2f s, speed %5.2f t/s, cache miss %d \033[0m \n\nInput: %s\nResponse: %s\n\n",
                     client.id, client.seq_id, client.n_prompt, client.n_decoded,
                     (t_main_end - client.t_start_prompt) / 1e6,
+                    (double) (client.n_prompt + client.n_decoded) / (t_main_end - client.t_start_prompt) * 1e6,
                     n_cache_miss,
                     ::trim(client.input).c_str(),
                     ::trim(client.response).c_str());
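The new "speed %5.2f t/s" field reports the combined prompt plus generation throughput for the client. A minimal standalone sketch of that arithmetic, with made-up timings and token counts (ggml_time_us() returns microseconds, hence the 1e6 factor):

// Standalone sketch of the throughput value added to the LOG_TEE call above.
// All numbers are made up; in the example above they come from the client state.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t t_start_prompt = 0;       // microseconds, as returned by ggml_time_us()
    const int64_t t_main_end     = 2500000; // 2.5 seconds later
    const int     n_prompt       = 100;     // prompt tokens
    const int     n_decoded      = 150;     // generated tokens

    const double time_s = (t_main_end - t_start_prompt) / 1e6;                                   //   2.50 s
    const double speed  = (double) (n_prompt + n_decoded) / (t_main_end - t_start_prompt) * 1e6; // 100.00 t/s

    printf("time %5.2f s, speed %5.2f t/s\n", time_s, speed);
    return 0;
}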


@@ -1025,7 +1025,7 @@ struct llama_kv_cache {
     uint32_t size = 0;

     // computed before each graph build
-    uint32_t cell_max = 0;
+    uint32_t n = 0;

     std::vector<llama_kv_cell> cells;
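The field keeps its role, an upper bound on the occupied part of the KV cache recomputed before each graph build, and only changes name from cell_max to n. A rough sketch of how such a bound can be scanned, assuming a cell with a negative pos is unused (illustration only, not the upstream llama_kv_cache_cell_max):

// Illustration only: find one past the last occupied cell, assuming a cell
// stores pos = -1 while it is free. Not the upstream implementation.
#include <cstdint>
#include <vector>

struct kv_cell_sketch {
    int32_t pos = -1; // token position held by this cell, -1 when free
};

static uint32_t kv_cell_max_sketch(const std::vector<kv_cell_sketch> & cells) {
    for (size_t i = cells.size(); i > 0; --i) {
        if (cells[i - 1].pos >= 0) {
            return (uint32_t) i; // attention only needs to cover cells [0, i)
        }
    }
    return 0; // cache is empty
}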
@@ -2619,7 +2619,7 @@ static struct ggml_cgraph * llm_build_llama(
     const int n_gpu_layers = model.n_gpu_layers;

     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : std::max(1, (int)kv_self.cell_max);
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;

     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
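The only change in this and the following graph-builder hunks is the source of n_kv: the allocator's measurement pass still assumes the full context, while a real decode uses the precomputed kv_self.n to bound how many cells the attention looks at. A small self-contained sketch of that selection, with illustrative names and values:

// Sketch of the n_kv selection above (names and values are illustrative).
#include <cstdint>
#include <cstdio>

int main() {
    const bool    is_measure = false; // true only during the allocator's measurement pass
    const int32_t n_ctx      = 4096;  // full context size
    const int32_t kv_n       = 160;   // stands in for kv_self.n, computed before the graph build

    // Measurement sizes buffers for the worst case; a real decode attends only kv_n cells.
    const int32_t n_kv = is_measure ? n_ctx : kv_n;

    printf("attending %d of %d KV cells\n", n_kv, n_ctx); // attending 160 of 4096 KV cells
    return 0;
}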
@@ -3011,7 +3011,7 @@ static struct ggml_cgraph * llm_build_baichaun(
     const int n_gpu_layers = model.n_gpu_layers;

     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;

     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3418,7 +3418,7 @@ static struct ggml_cgraph * llm_build_falcon(
     const int n_gpu_layers = model.n_gpu_layers;

     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;

     const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
@@ -3783,7 +3783,7 @@ static struct ggml_cgraph * llm_build_starcoder(
     const float norm_eps = hparams.f_norm_eps;

     const int32_t n_tokens = batch.n_tokens;
-    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.cell_max;
+    const int32_t n_kv     = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
     const int32_t kv_head  = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;

     auto & buf_compute = lctx.buf_compute;
@@ -4115,8 +4115,10 @@ static int llama_decode_internal(
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
    // if we start defragmenting the cache, the benefit from this will be more important
-    kv_self.cell_max = llama_kv_cache_cell_max(kv_self);
-    //printf("kv_self.cell_max = %d\n", kv_self.cell_max);
+    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
+    kv_self.n = std::max(32, llama_kv_cache_cell_max(kv_self));
+    //printf("kv_self.n = %d\n", kv_self.n);

     ggml_allocr_reset(lctx.alloc);
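The commented-out variant in the last hunk keeps a padded alternative around: GGML_PAD rounds the occupied cell count up to a multiple of 32, which the TODO speculates might suit CUDA kernels better, while the active line only enforces a floor of 32. A small sketch of the difference, using a stand-in macro that rounds up to the next multiple of n and a made-up cell count:

// Compare the two heuristics from the hunk above with a made-up cell count.
#include <algorithm>
#include <cstdio>

// Stand-in for GGML_PAD: round x up to the next multiple of n.
#define PAD_UP(x, n) (((x) + (n) - 1) / (n) * (n))

int main() {
    const int cell_max = 70; // hypothetical number of occupied KV cells

    const int n_active = std::max(32, cell_max);             // active variant -> 70
    const int n_padded = std::max(32, PAD_UP(cell_max, 32)); // padded variant -> 96

    printf("active %d, padded %d\n", n_active, n_padded);
    return 0;
}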