From 2f2eff6e13b9f19f52800b915f49dc2b73596307 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Mon, 8 May 2023 20:58:00 +0800
Subject: [PATCH] the dark gods have been sated, and redpajama is integrated... but at what cost?

---
 expose.cpp          | 10 ++++++++--
 gpttype_adapter.cpp | 42 ++++++++++++++++++++++++++++--------------
 model_adapter.h     |  1 +
 otherarch/neox.cpp  | 27 ++++++++++++++++++++-------
 4 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/expose.cpp b/expose.cpp
index 38defd15e..df426d82e 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -128,16 +128,22 @@ extern "C"
                 return true;
             }
         }
-        else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
+        else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
         {
             printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
             ModelLoadResult lr = gpttype_load_model(inputs, file_format);
             if (lr == ModelLoadResult::RETRY_LOAD)
+            {
+                file_format = FileFormat::NEOX_3;
+                printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+                lr = gpttype_load_model(inputs, file_format);
+            }
+            if (lr == ModelLoadResult::RETRY_LOAD)
             {
                 file_format = FileFormat::NEOX_1;
                 printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
                 lr = gpttype_load_model(inputs, file_format);
-            }
+            }
             if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
             {
                 return false;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index ba7a0c2cd..f0a12bcaf 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -369,7 +369,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in

         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
     {
         ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
         if(res==ModelLoadResult::FAIL)
@@ -383,7 +383,23 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             return res;
         }
         // determine the required inference memory per token:
-        stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+        stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
+
+        if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
+        {
+            //run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
+            std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
+            stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, FileFormat::NEOX_3);
+            int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
+            std::string predicted = vocab.id_to_token[topid].c_str();
+            if(predicted.find("8") != std::string::npos)
+            {
+                printf("\n---\nRedPajama NeoX Detected! Switching to new format! (use_parallel_residual=False)\n");
+                ggml_free(neox_ctx.ctx);
+                return ModelLoadResult::RETRY_LOAD;
+            }
+        }
+
         return ModelLoadResult::SUCCESS;
     }
     else
@@ -514,13 +530,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     }

     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
-    bool approved_format = (file_format == FileFormat::GGML ||
-                            file_format == FileFormat::GGHF ||
-                            file_format == FileFormat::GGJT ||
-                            file_format == FileFormat::GPT2_2 ||
-                            file_format == FileFormat::GPTJ_3 ||
-                            file_format == FileFormat::NEOX_1 ||
-                            file_format == FileFormat::NEOX_2);
+    bool approved_format = !(file_format == FileFormat::BADFORMAT ||
+                             file_format == FileFormat::GPT2_1 ||
+                             file_format == FileFormat::GPTJ_1 ||
+                             file_format == FileFormat::GPTJ_2 ||
+                             file_format == FileFormat::RWKV_1);
     bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
     // bool blasmode = false;
     int original_batch = params.n_batch;
@@ -579,7 +593,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gpt2_ctx_v2.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2)
+    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
     {
         n_vocab = neox_ctx.hparams.n_vocab;
     }
@@ -614,14 +628,14 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         {
             for (auto id : embd_inp)
             {
-                printf("'%s', ",llama_token_to_str(llama_ctx_v1, id));
+                printf("'%s (%d)', ",llama_token_to_str(llama_ctx_v1, id),id);
             }
         }
         else
         {
             for (auto id : embd_inp)
             {
-                printf("'%s', ",vocab.id_to_token[id].c_str());
+                printf("'%s (%d)', ",vocab.id_to_token[id].c_str(),id);
             }
         }
         printf("\n");
@@ -665,9 +679,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
        {
            evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
        }
-       else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2)
+       else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
        {
-           evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token);
+           evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
        }
        else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
        {
diff --git a/model_adapter.h b/model_adapter.h
index 5c303a1fd..3e3f8f30c 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -31,6 +31,7 @@ enum FileFormat

     NEOX_1=400,
     NEOX_2=401,
+    NEOX_3=402,
 };

 enum ModelLoadResult
diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp
index 47397c309..fb6b4b996 100644
--- a/otherarch/neox.cpp
+++ b/otherarch/neox.cpp
@@ -345,7 +345,8 @@ bool stablelm_eval(
         const int n_past,
         const std::vector<gpt_vocab::id> & embd_inp,
         std::vector<float> & embd_w,
-        size_t & mem_per_token) {
+        size_t & mem_per_token,
+        FileFormat file_format) {
     const int N = embd_inp.size();

     const auto & hparams = model.hparams;
@@ -494,6 +495,12 @@ bool stablelm_eval(
             }
         }

+        if(file_format==FileFormat::NEOX_3)
+        {
+            // layer input + Attn
+            cur = ggml_add(ctx0, cur, inpL);
+        }
+
         struct ggml_tensor * inpFF = cur;

         // feed-forward network
@@ -502,7 +509,7 @@ bool stablelm_eval(
             // post attention layer norm
             // note here we pass inpL instead of cur
             {
-                cur = ggml_norm(ctx0, inpL);
+                cur = ggml_norm(ctx0, (file_format==FileFormat::NEOX_3?cur:inpL));

                 cur = ggml_add(ctx0,
                     ggml_mul(ctx0,
@@ -533,11 +540,17 @@ bool stablelm_eval(
                     cur);
         }

-        // layer input + FF
-        cur = ggml_add(ctx0, cur, inpFF);
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpL);
+        if (file_format == FileFormat::NEOX_3)
+        {
+            // layer input + FF
+            inpL = ggml_add(ctx0, cur, inpFF);
+        }
+        else
+        {
+            cur = ggml_add(ctx0, cur, inpFF);
+            // input for next layer
+            inpL = ggml_add(ctx0, cur, inpL);
+        }
     }

     // norm
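Note on what the NEOX_3 branches above encode: GPT-NeoX-style checkpoints with use_parallel_residual=True (the existing NEOX_1/NEOX_2 path) feed the same layer input to both the attention and feed-forward branches and sum all three terms, while RedPajama-style checkpoints (use_parallel_residual=False, the new NEOX_3 path) feed the attention result into the feed-forward branch and chain the residual adds. A minimal sketch of the two wirings, using plain float vectors in place of ggml tensors; ln1, ln2, attn and ffn are placeholder callables for illustration, not functions from this codebase:

#include <functional>
#include <vector>

using Vec = std::vector<float>;
using Fn  = std::function<Vec(const Vec &)>;

// Element-wise sum of two equally sized vectors (stand-in for ggml_add).
static Vec add(const Vec &a, const Vec &b) {
    Vec out(a.size());
    for (size_t i = 0; i < a.size(); ++i) out[i] = a[i] + b[i];
    return out;
}

// use_parallel_residual = true (NEOX_1 / NEOX_2):
// both branches read the same layer input x, and their outputs are summed with it.
static Vec block_parallel(const Vec &x, Fn ln1, Fn ln2, Fn attn, Fn ffn) {
    Vec a = attn(ln1(x));
    Vec f = ffn(ln2(x));          // post-attention norm still sees x (inpL), not x + a
    return add(add(x, a), f);     // x + attn(ln1(x)) + ffn(ln2(x))
}

// use_parallel_residual = false (NEOX_3 / RedPajama):
// the feed-forward branch sees the attention result, so the residual adds are chained.
static Vec block_sequential(const Vec &x, Fn ln1, Fn ln2, Fn attn, Fn ffn) {
    Vec h = add(x, attn(ln1(x))); // "layer input + Attn"
    return add(h, ffn(ln2(h)));   // "(layer input + Attn) + FF"
}

block_sequential mirrors the NEOX_3 branches added to stablelm_eval (add inpL to the attention output, norm over cur instead of inpL, then chain inpL = cur + inpFF), and block_parallel matches the existing NEOX_1/NEOX_2 path. The detection hack in gpttype_load_model leans on exactly this difference: it re-runs the eval on the tokens of "1 2 3 4 5 6 7" with the NEOX_3 wiring and only retries as NEOX_3 if the top predicted token contains "8".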