From 5eec5d6ed9dfabf94dda4fd4705c4015ad0e82e2 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Tue, 25 Apr 2023 20:34:18 +0800
Subject: [PATCH] Added backwards compatibility to an earlier version of NeoX.

---
 expose.cpp          | 10 ++++++++--
 gpttype_adapter.cpp | 16 ++++++++++------
 koboldcpp.py        |  2 +-
 model_adapter.cpp   |  2 +-
 model_adapter.h     |  1 +
 otherarch/neox.cpp  | 39 ++++++++++++++++++++++++++++-----------
 6 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/expose.cpp b/expose.cpp
index 02f40b489..b19737106 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -117,10 +117,16 @@ extern "C"
             return true;
         }
     }
-    else if(file_format==FileFormat::NEOX_1)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
     {
         printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
-        ModelLoadResult lr = gpttype_load_model(inputs, file_format);
+        ModelLoadResult lr = gpttype_load_model(inputs, file_format);
+        if (lr == ModelLoadResult::RETRY_LOAD)
+        {
+            file_format = FileFormat::NEOX_1;
+            printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+            lr = gpttype_load_model(inputs, file_format);
+        }
         if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
         {
             return false;
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index df185405f..1d33a2243 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -218,13 +218,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::NEOX_1)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
     {
-        bool res = stablelm_model_load(params.model, neox_ctx, vocab);
-        if(!res)
+        ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
+        if(res==ModelLoadResult::FAIL)
         {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return ModelLoadResult::FAIL;
+            return res;
+        }
+        else if(res==ModelLoadResult::RETRY_LOAD)
+        {
+            printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
+            return res;
         }
 
         // determine the required inference memory per token:
         stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
@@ -245,8 +250,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         }
 
         // determine the required inference memory per token:
-        gptj_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
+        gptj_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
 
         //if the logits are NAN, it means the model is incompatible
         if(logits.size()>0 && IsNanCheck(logits[0]))
diff --git a/koboldcpp.py b/koboldcpp.py
index 8310b7566..35e8d2cbd 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -148,7 +148,7 @@ maxctx = 2048
 maxlen = 128
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.13.1"
+KcppVersion = "1.14"
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
diff --git a/model_adapter.cpp b/model_adapter.cpp
index f6225dc55..6c54f3041 100644
--- a/model_adapter.cpp
+++ b/model_adapter.cpp
@@ -130,7 +130,7 @@ void print_tok_vec(std::vector &embd)
         else if(vocabsiz < 31998 || vocabsiz > 33000)
         {
             //anything outside the llama v1 range is assumed to be NeoX
-            fileformat = FileFormat::NEOX_1;
+            fileformat = FileFormat::NEOX_2;
         }
     }
     else if(magic == 0x67676d66) //v2 format ggmf
diff --git a/model_adapter.h b/model_adapter.h
index 344643d2b..5c303a1fd 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -30,6 +30,7 @@ enum FileFormat
     RWKV_1=300,
 
     NEOX_1=400,
+    NEOX_2=401,
 };
 
 enum ModelLoadResult
diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp
index 203df0983..07eb26dbc 100644
--- a/otherarch/neox.cpp
+++ b/otherarch/neox.cpp
@@ -17,13 +17,13 @@
 
 
 // load the model's weights from a file
-bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_vocab & vocab) {
+ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_vocab & vocab, FileFormat file_format) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
+        return ModelLoadResult::FAIL;
     }
 
     // verify magic
@@ -32,7 +32,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
         fin.read((char *) &magic, sizeof(magic));
         if (magic != 0x67676d6c) {
             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
+            return ModelLoadResult::FAIL;
         }
     }
 
@@ -88,7 +88,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
         {
             fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
                     __func__, fname.c_str(), model.hparams.ftype);
-            return false;
+            return ModelLoadResult::FAIL;
         }
     }
 
@@ -151,7 +151,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
         model.ctx = ggml_init(params);
         if (!model.ctx) {
             fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
+            return ModelLoadResult::FAIL;
         }
     }
 
@@ -276,19 +276,19 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
 
         if (model.tensors.find(name.data()) == model.tensors.end()) {
             fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-            return false;
+            return ModelLoadResult::FAIL;
         }
 
         auto tensor = model.tensors[name.data()];
         if (ggml_nelements(tensor) != nelements) {
             fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-            return false;
+            return ModelLoadResult::FAIL;
         }
 
         if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
             fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
                     __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-            return false;
+            return ModelLoadResult::FAIL;
         }
 
         // for debugging
@@ -296,12 +296,29 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
             printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
         }
 
-        const size_t bpe = ggml_type_size(ggml_type(ttype));
+        size_t bpe = ggml_type_size(ggml_type(ttype));
+
+        if(file_format==FileFormat::NEOX_1)
+        {
+            switch (ttype) {
+                case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
+                case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
+                case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
+                case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
+                case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break;
+                case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break;
+                default:
+                {
+                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ttype);
+                    return ModelLoadResult::FAIL;
+                }
+            };
+        }
 
         if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
             fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                     __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-            return false;
+            return ModelLoadResult::RETRY_LOAD;
         }
 
         fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
@@ -320,7 +337,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
 
     fin.close();
 
-    return true;
+    return ModelLoadResult::SUCCESS;
 }
 
 // evaluate the transformer