Added backwards compatibility with an earlier version of the NeoX format.
commit 5eec5d6ed9
parent bff998f871
6 changed files with 49 additions and 21 deletions
expose.cpp | 10
@@ -117,10 +117,16 @@ extern "C"
             return true;
         }
     }
-    else if(file_format==FileFormat::NEOX_1)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
     {
         printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
         ModelLoadResult lr = gpttype_load_model(inputs, file_format);
+        if (lr == ModelLoadResult::RETRY_LOAD)
+        {
+            file_format = FileFormat::NEOX_1;
+            printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+            lr = gpttype_load_model(inputs, file_format);
+        }
         if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
         {
             return false;
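For context, the dispatch above now tries the current NEOX_2 layout first and falls back to NEOX_1 only when the loader signals RETRY_LOAD rather than a hard FAIL. Below is a minimal, self-contained sketch of that retry pattern; try_load is a hypothetical stand-in for gpttype_load_model, and both enums are re-declared in simplified form rather than copied from the project headers.

    #include <cstdio>

    // Simplified stand-ins for the project's FileFormat and ModelLoadResult enums.
    enum FileFormat { NEOX_1 = 400, NEOX_2 = 401 };
    enum class ModelLoadResult { FAIL, SUCCESS, RETRY_LOAD };

    // Stub loader in place of gpttype_load_model(): pretends the file only
    // parses under the legacy NEOX_1 layout, so the retry path is exercised.
    static ModelLoadResult try_load(FileFormat fmt)
    {
        return (fmt == NEOX_1) ? ModelLoadResult::SUCCESS : ModelLoadResult::RETRY_LOAD;
    }

    int main()
    {
        FileFormat file_format = NEOX_2;            // try the newest layout first
        ModelLoadResult lr = try_load(file_format);
        if (lr == ModelLoadResult::RETRY_LOAD)
        {
            file_format = NEOX_1;                   // fall back to the old layout
            printf("Retrying as GPT-NeoX (ver %d)\n", file_format);
            lr = try_load(file_format);
        }
        return (lr == ModelLoadResult::SUCCESS) ? 0 : 1;
    }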
@@ -218,13 +218,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::NEOX_1)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
     {
-        bool res = stablelm_model_load(params.model, neox_ctx, vocab);
-        if(!res)
+        ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
+        if(res==ModelLoadResult::FAIL)
         {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return ModelLoadResult::FAIL;
+            return res;
         }
+        else if(res==ModelLoadResult::RETRY_LOAD)
+        {
+            printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
+            return res;
+        }
         // determine the required inference memory per token:
         stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
@@ -245,8 +250,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
 
     // determine the required inference memory per token:
-    gptj_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
     gptj_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
 
     //if the logits are NAN, it means the model is incompatible
     if(logits.size()>0 && IsNanCheck(logits[0]))
@@ -148,7 +148,7 @@ maxctx = 2048
 maxlen = 128
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.13.1"
+KcppVersion = "1.14"
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
@@ -130,7 +130,7 @@ void print_tok_vec(std::vector<float> &embd)
         else if(vocabsiz < 31998 || vocabsiz > 33000)
         {
             //anything outside the llama v1 range is assumed to be NeoX
-            fileformat = FileFormat::NEOX_1;
+            fileformat = FileFormat::NEOX_2;
         }
     }
     else if(magic == 0x67676d66) //v2 format ggmf
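Only the assumed NeoX version changes here; the heuristic itself stays the same. A rough standalone sketch of the rule, with classify_by_vocab as a hypothetical helper and the GGML fallback value an assumption (the real code distinguishes several llama-family formats):

    #include <cstdint>
    #include <cstdio>

    // Values mirror the FileFormat hunk further down; GGML here is an assumption.
    enum FileFormat { GGML = 1, NEOX_1 = 400, NEOX_2 = 401 };

    // Hypothetical condensation of the heuristic: a vocab size outside the
    // llama v1 range (31998..33000) is assumed to be GPT-NeoX, now tagged
    // NEOX_2 first so the loader can still retry as NEOX_1.
    static FileFormat classify_by_vocab(int32_t vocabsiz)
    {
        if (vocabsiz < 31998 || vocabsiz > 33000)
        {
            return FileFormat::NEOX_2;
        }
        return FileFormat::GGML;
    }

    int main()
    {
        printf("%d\n", classify_by_vocab(50432)); // typical NeoX vocab -> 401
        printf("%d\n", classify_by_vocab(32000)); // llama v1 vocab     -> 1
    }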
@@ -30,6 +30,7 @@ enum FileFormat
     RWKV_1=300,
 
     NEOX_1=400,
+    NEOX_2=401,
 };
 
 enum ModelLoadResult
@@ -17,13 +17,13 @@
 
 
 // load the model's weights from a file
-bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_vocab & vocab) {
+ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_vocab & vocab, FileFormat file_format) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
+        return ModelLoadResult::FAIL;
     }
 
     // verify magic
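This signature change is the backbone of the commit: the loader's boolean result becomes a tri-state ModelLoadResult, so a recoverable size mismatch (RETRY_LOAD) is distinguishable from a hard failure (FAIL). A hedged sketch of the pattern in isolation, with load_weights as a hypothetical function rather than the project's actual loader:

    #include <cstdio>
    #include <string>

    // The tri-state result type this commit threads through the NeoX loader.
    enum class ModelLoadResult { FAIL, SUCCESS, RETRY_LOAD };

    // Hypothetical loader skeleton: where the old bool version returned false
    // for every error, the tri-state version can ask the caller to retry with
    // the legacy file layout instead of aborting outright.
    static ModelLoadResult load_weights(const std::string &fname, bool size_mismatch)
    {
        if (fname.empty())
            return ModelLoadResult::FAIL;        // unrecoverable: bad input
        if (size_mismatch)
            return ModelLoadResult::RETRY_LOAD;  // recoverable: try the old layout
        return ModelLoadResult::SUCCESS;
    }

    int main()
    {
        ModelLoadResult r = load_weights("model.bin", /*size_mismatch=*/true);
        printf("retry requested: %s\n", r == ModelLoadResult::RETRY_LOAD ? "yes" : "no");
    }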
@@ -32,7 +32,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
         fin.read((char *) &magic, sizeof(magic));
         if (magic != 0x67676d6c) {
             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
+            return ModelLoadResult::FAIL;
         }
     }
 
@@ -88,7 +88,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
         {
             fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
                     __func__, fname.c_str(), model.hparams.ftype);
-            return false;
+            return ModelLoadResult::FAIL;
         }
     }
 
@@ -151,7 +151,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
     model.ctx = ggml_init(params);
     if (!model.ctx) {
         fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-        return false;
+        return ModelLoadResult::FAIL;
     }
     }
 
@@ -276,19 +276,19 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
 
         if (model.tensors.find(name.data()) == model.tensors.end()) {
             fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-            return false;
+            return ModelLoadResult::FAIL;
         }
 
         auto tensor = model.tensors[name.data()];
         if (ggml_nelements(tensor) != nelements) {
             fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-            return false;
+            return ModelLoadResult::FAIL;
         }
 
         if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
             fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
                     __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-            return false;
+            return ModelLoadResult::FAIL;
         }
 
         // for debugging
@@ -296,12 +296,29 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
             printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
         }
 
-        const size_t bpe = ggml_type_size(ggml_type(ttype));
+        size_t bpe = ggml_type_size(ggml_type(ttype));
+
+        if(file_format==FileFormat::NEOX_1)
+        {
+            switch (ttype) {
+                case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
+                case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
+                case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
+                case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
+                case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break;
+                case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break;
+                default:
+                {
+                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ttype);
+                    return ModelLoadResult::FAIL;
+                }
+            };
+        }
 
         if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
             fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                     __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-            return false;
+            return ModelLoadResult::RETRY_LOAD;
         }
 
         fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
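This hunk is where the backwards compatibility actually happens: NEOX_1 files number their tensor types differently, so bytes-per-element must be remapped before the size check, and a mismatch now returns RETRY_LOAD so the caller can re-attempt under the other layout. A simplified, runnable illustration of that check; bytes_per_element is a hypothetical stand-in for ggml_type_size, and the hard-coded sizes are assumptions that ignore ggml's block layout for quantized types:

    #include <cstdio>

    // Hypothetical stand-in for ggml_type_size(); byte sizes are assumed.
    static size_t bytes_per_element(int ttype, bool legacy_neox_1)
    {
        if (legacy_neox_1)
        {
            // NEOX_1 files use a different tensor-type numbering, so the id
            // must be remapped before sizing (cf. the switch in the hunk above).
            switch (ttype)
            {
                case 0: return 4;  // F32
                case 1: return 2;  // F16
                default: return 0; // unknown type id -> treated as FAIL upstream
            }
        }
        return 4; // simplified: assume F32 for NEOX_2 files
    }

    int main()
    {
        const size_t nelements = 1024;
        const size_t bpe = bytes_per_element(/*ttype=*/0, /*legacy_neox_1=*/true);
        const size_t expected = nelements * bpe;
        const size_t actual = 2048; // pretend this came from ggml_nbytes(tensor)
        if (expected != actual)
        {
            // The patched loader returns RETRY_LOAD here, so the caller can
            // re-attempt the whole load under the other NeoX layout.
            printf("size mismatch: got %zu, expected %zu -> retry\n", actual, expected);
            return 2;
        }
        printf("tensor size ok (%zu bytes)\n", actual);
        return 0;
    }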
@@ -320,7 +337,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
 
     fin.close();
 
-    return true;
+    return ModelLoadResult::SUCCESS;
 }
 
 // evaluate the transformer