From 1665ad8bf1780de80e1a5e7d99761a3ebcbde4ee Mon Sep 17 00:00:00 2001
From: Leon Ericsson
Date: Fri, 15 Dec 2023 14:14:17 +0100
Subject: [PATCH] BUG: generates gibberish/repeating tokens after a while

---
 common/common.h            |  6 ++--
 examples/lookup/lookup.cpp | 61 +++++++++++++++++++-------------------
 2 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/common/common.h b/common/common.h
index 3acdfdd74..8c73da247 100644
--- a/common/common.h
+++ b/common/common.h
@@ -75,10 +75,10 @@ struct gpt_params {
     // // sampling parameters
     struct llama_sampling_params sparams;
 
-    std::string model = "models/7B/ggml-model-q4_0.gguf"; // model path
+    std::string model = "models/7B/ggml-model-f16.gguf"; // model path
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
-    std::string prompt = "Hello my name is";
+    std::string prompt = "";
     std::string prompt_file = ""; // store the external prompt file name
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
@@ -228,4 +228,4 @@ void dump_non_result_info_yaml(
 void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
 
 // Dump the KV cache view showing individual sequences in each cell (long output).
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
\ No newline at end of file
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index 3f3ad10b9..a9347e51e 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -122,16 +122,7 @@ int main(int argc, char ** argv){
 
             draft.clear();
             draft.push_back(id);
-            // drafts[0].i_batch_tgt.push_back(0);
-
-            // llama_batch_clear(batch_dft);
-            // llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
-
-            // llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
-            // // LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
-            // llama_decode (ctx_dft, batch_dft);
-
-            ++n_past_dft;
+            inp.push_back(id);
 
             break;
         }
@@ -142,33 +133,41 @@ int main(int argc, char ** argv){
         llama_batch_clear(batch_tgt);
         llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
 
-        bool match = false;
         // generate n_pred tokens through prompt lookup
-        for (int ngram_size = max_ngram_size ; ngram_size > 0; --ngram_size){
-            if (match){
-                break;
-            }
-            const auto & prev = ctx_sampling->prev;
-            int prev_size = prev.size();
-            const llama_token * ngram = &prev[prev_size - ngram_size];
-
-            for (int i = 0; i <= (int) prev_size - (ngram_size * 2); ++i) {
-                if (prev[i] == ngram[0] && prev[i + 1] == ngram[1] && prev[i + 2] == ngram[2]) {
-                    const int startIdx = i + ngram_size;
-                    const int endIdx = startIdx + n_draft;
-                    if (endIdx < prev_size){
-                        match = true;
-                        for (int j = startIdx; j < endIdx; ++j) {
-                            LOG(" - draft candidate %d: %d\n", j, prev[j]);
-                            draft.push_back(prev[j]);
-                            llama_batch_add(batch_tgt, prev[j], n_past + j + 1, { 1 }, true);
-                            ++n_drafted;
+        auto prompt_lookup = [&]() -> void {
+            int inp_size = inp.size();
+            for (int ngram_size = max_ngram_size ; ngram_size > 0; --ngram_size){
+                const llama_token * ngram = &inp[inp_size - ngram_size];
+
+                for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
+                    bool match = true;
+                    for (int j = 0; j < ngram_size; ++j) {
+                        if (inp[i + j] != ngram[j]) {
+                            match = false;
+                            break;
+                        }
+                    }
+
+                    if (match) {
+                        const int startIdx = i + ngram_size;
+                        const int endIdx = startIdx + n_draft;
+                        if (endIdx < inp_size){
+                            for (int j = startIdx; j < endIdx; ++j) {
+                                LOG(" - draft candidate %d: %d\n", j, inp[j]);
+                                draft.push_back(inp[j]);
+                                llama_batch_add(batch_tgt, inp[j], n_past + j + 1, { 0 }, true);
+                                ++n_drafted;
+                            }
+                            return;
                         }
                     }
                 }
             }
-        }
+            return;
+        };
+
+        prompt_lookup();
 
         llama_decode(ctx, batch_tgt);
         ++n_past;
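
Side note, not part of the patch itself: the n-gram search that the new
prompt_lookup lambda performs can be exercised on its own. The sketch below is
a minimal standalone reimplementation for illustration only, with plain ints
standing in for llama_token; the helper name prompt_lookup_draft and the
max_ngram_size/n_draft values are invented for the example.

#include <cstdio>
#include <vector>

// Return up to n_draft tokens that followed an earlier occurrence of the
// history's trailing n-gram, trying the longest n-gram first; empty if
// nothing usable matches.
static std::vector<int> prompt_lookup_draft(const std::vector<int> & inp,
                                            int max_ngram_size, int n_draft) {
    const int inp_size = (int) inp.size();
    for (int ngram_size = max_ngram_size; ngram_size > 0; --ngram_size) {
        if (ngram_size * 2 > inp_size) {
            continue; // history too short for a non-overlapping match
        }
        const int * ngram = &inp[inp_size - ngram_size]; // trailing n-gram

        // slide over the history, stopping early enough that a candidate
        // cannot overlap the trailing n-gram itself
        for (int i = 0; i <= inp_size - (ngram_size * 2); ++i) {
            bool match = true;
            for (int j = 0; j < ngram_size; ++j) {
                if (inp[i + j] != ngram[j]) {
                    match = false;
                    break;
                }
            }
            if (match) {
                const int startIdx = i + ngram_size;
                const int endIdx   = startIdx + n_draft;
                if (endIdx < inp_size) {
                    return std::vector<int>(inp.begin() + startIdx,
                                            inp.begin() + endIdx);
                }
            }
        }
    }
    return {};
}

int main() {
    // the history ends in the bigram (10, 20), which also occurs at the
    // start, so the draft should be the tokens that followed it: 30, 40
    const std::vector<int> inp = { 10, 20, 30, 40, 50, 10, 20 };
    for (int t : prompt_lookup_draft(inp, /*max_ngram_size=*/2, /*n_draft=*/2)) {
        printf("draft: %d\n", t);
    }
    return 0;
}

As in the patch, the longest n-gram wins and the first hit is taken; the
endIdx < inp_size guard keeps the drafted span strictly inside tokens that
have already been seen.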