From b692e4d2a4e18f31cbfe5986c0d26415e925d5de Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sun, 14 May 2023 17:21:07 +0800
Subject: [PATCH] wip

---
 expose.cpp            | 36 +++++++++++++++++++++++++++++-------
 ggml.h                |  2 ++
 gpttype_adapter.cpp   | 32 ++++++++++++++++++++++----------
 model_adapter.cpp     | 37 +++++++++++++++++++++++++++++++------
 model_adapter.h       |  8 ++++++--
 otherarch/gpt2_v2.cpp |  3 +++
 otherarch/gptj_v2.cpp |  3 +++
 otherarch/neox.cpp    |  3 +++
 8 files changed, 99 insertions(+), 25 deletions(-)

diff --git a/expose.cpp b/expose.cpp
index df426d82e..22b1ebf61 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -63,7 +63,7 @@ extern "C"
         putenv((char*)deviceenv.c_str());
         executable_path = inputs.executable_path;
 
-        if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3)
+        if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
         {
             printf("\n---\nIdentified as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
             ModelLoadResult lr = gpttype_load_model(inputs, file_format);
@@ -73,11 +73,18 @@ extern "C"
             {
                 //if we tried 1 first, then try 3 and lastly 2
                 //otherwise if we tried 3 first, then try 2
-                file_format = FileFormat::GPTJ_3;
+                file_format = FileFormat::GPTJ_4;
                 printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
                 lr = gpttype_load_model(inputs, file_format);
             }
 
+            if (lr == ModelLoadResult::RETRY_LOAD)
+            {
+                file_format = FileFormat::GPTJ_3;
+                printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+                lr = gpttype_load_model(inputs, file_format);
+            }
+
             //lastly try format 2
             if (lr == ModelLoadResult::RETRY_LOAD)
             {
@@ -96,11 +103,17 @@ extern "C"
                 return true;
             }
         }
-        else if(file_format==FileFormat::GPT2_1||file_format==FileFormat::GPT2_2)
+        else if(file_format==FileFormat::GPT2_1||file_format==FileFormat::GPT2_2||file_format==FileFormat::GPT2_3)
         {
             printf("\n---\nIdentified as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
             ModelLoadResult lr = gpttype_load_model(inputs, file_format);
             if (lr == ModelLoadResult::RETRY_LOAD)
+            {
+                file_format = FileFormat::GPT2_3;
+                printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+                lr = gpttype_load_model(inputs, file_format);
+            }
+            if (lr == ModelLoadResult::RETRY_LOAD)
             {
                 file_format = FileFormat::GPT2_2;
                 printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
@@ -128,15 +141,24 @@ extern "C"
                 return true;
             }
         }
-        else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
+        else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
         {
             printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
             ModelLoadResult lr = gpttype_load_model(inputs, file_format);
             if (lr == ModelLoadResult::RETRY_LOAD)
             {
-                file_format = FileFormat::NEOX_3;
-                printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
-                lr = gpttype_load_model(inputs, file_format);
+                if(file_format==FileFormat::NEOX_2)
+                {
+                    file_format = FileFormat::NEOX_3;
+                    printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+                    lr = gpttype_load_model(inputs, file_format);
+                }
+                else
+                {
+                    file_format = FileFormat::NEOX_5;
+                    printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+                    lr = gpttype_load_model(inputs, file_format);
+                }
             }
             if (lr == ModelLoadResult::RETRY_LOAD)
             {
diff --git a/ggml.h b/ggml.h
index 391a0313e..9f8cba71c 100644
--- a/ggml.h
+++ b/ggml.h
@@ -190,6 +190,8 @@
 #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_NODES 4096
 #define GGML_MAX_PARAMS 256
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index cbbd2e7df..89c525da8 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -329,8 +329,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
         return ModelLoadResult::SUCCESS;
     }
-    else if (file_format == FileFormat::GPT2_2)
+    else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
     {
+        //newer format has bit unshuffling
+        SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
+
         ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format);
         if(res==ModelLoadResult::FAIL)
         {
@@ -372,7 +375,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
         ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
         if(res==ModelLoadResult::FAIL)
@@ -385,14 +388,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
             return res;
         }
+
+        //newer format has bit unshuffling
+        SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
+
         // determine the required inference memory per token:
         stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
-        if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
+        if(logits.size()>0 && (file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_4) && !IsNanCheck(logits[0]))
         {
             //run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
             std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
-            stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, FileFormat::NEOX_3);
+            stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, (file_format==FileFormat::NEOX_2?FileFormat::NEOX_3:FileFormat::NEOX_5));
             int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
             std::string predicted = vocab.id_to_token[topid].c_str();
             if(predicted.find("8") != std::string::npos)
             {
@@ -407,6 +414,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
+        //newer format has bit unshuffling
+        SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
+
         ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab);
         if (loadresult == ModelLoadResult::FAIL)
         {
@@ -584,7 +594,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gptj_ctx_v1.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::GPTJ_3)
+    else if(file_format == FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
    {
         n_vocab = gptj_ctx_v2.hparams.n_vocab;
     }
@@ -592,11 +602,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gpt2_ctx_v1.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::GPT2_2)
+    else if(file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
     {
         n_vocab = gpt2_ctx_v2.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
+    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
         n_vocab = neox_ctx.hparams.n_vocab;
     }
@@ -678,11 +688,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             {
                 evalres = legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
             }
-            else if(file_format==FileFormat::GPT2_2)
+            else if(file_format==FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
             {
                 evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
             }
-            else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
+            else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
             {
                 evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
             }
@@ -750,9 +760,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             // set the logit of the eos token (2) to zero to avoid sampling it
             if ((file_format == FileFormat::GPT2_1 ||
                  file_format == FileFormat::GPT2_2 ||
+                 file_format == FileFormat::GPT2_3 ||
                  file_format == FileFormat::GPTJ_1 ||
                  file_format == FileFormat::GPTJ_2 ||
-                 file_format == FileFormat::GPTJ_3) &&
+                 file_format == FileFormat::GPTJ_3 ||
+                 file_format == FileFormat::GPTJ_4) &&
                 logits.size() > 50256)
             {
                 logits[50256] = (logits[50256] < 0 ? logits[50256] : 0);
diff --git a/model_adapter.cpp b/model_adapter.cpp
index 487feb043..13e6b4770 100644
--- a/model_adapter.cpp
+++ b/model_adapter.cpp
@@ -108,9 +108,15 @@ void print_tok_vec(std::vector &embd)
             fin.read((char *)&temp, sizeof(temp)); //n_layer
             fin.read((char *)&temp, sizeof(temp)); //n_rot
             fin.read((char *)&temp, sizeof(temp)); //f16
-            if(temp!=0 && temp!=1)
+            const int32_t qntvr = temp / 1000;
+            temp %= 1000;
+            if (qntvr != 0)
             {
-                fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
+                fileformat = FileFormat::GPTJ_4;
+            }
+            else if (temp != 0 && temp != 1)
+            {
+                fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
             }
         }
         else if(vocabsiz==50257)
@@ -122,15 +128,34 @@ void print_tok_vec(std::vector &embd)
             fin.read((char *)&temp, sizeof(temp)); //n_head
             fin.read((char *)&temp, sizeof(temp)); //n_layer
             fin.read((char *)&temp, sizeof(temp)); //f16
-            if(temp!=0 && temp!=1)
+            const int32_t qntvr = temp / 1000;
+            temp %= 1000;
+            if (qntvr != 0)
             {
-                fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
-            }
+                fileformat = FileFormat::GPT2_3;
+            }
+            else if (temp != 0 && temp != 1)
+            {
+                fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
+            }
         }
         else if(vocabsiz < 31998 || vocabsiz > 33000)
         {
             //anything outside the llama v1 range is assumed to be NeoX
-            fileformat = FileFormat::NEOX_2;
+            fileformat = FileFormat::NEOX_4;
+            uint32_t temp;
+            fin.read((char *)&temp, sizeof(temp)); //ctx
+            fin.read((char *)&temp, sizeof(temp)); //n_embd
+            fin.read((char *)&temp, sizeof(temp)); //n_head
+            fin.read((char *)&temp, sizeof(temp)); //n_layer
+            fin.read((char *)&temp, sizeof(temp)); //n_rot
+            fin.read((char *)&temp, sizeof(temp)); //f16
+            const int32_t qntvr = temp / 1000;
+            temp %= 1000;
+            if(qntvr==0)
+            {
+                fileformat = FileFormat::NEOX_2;
+            }
         }
     }
     else if(magic == 0x67676d66) //v2 format ggmf
diff --git a/model_adapter.h b/model_adapter.h
index d151831e7..c6eb41582 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -19,20 +19,24 @@ enum FileFormat
     GGML=1, // 1=(original llama ggml, alpaca, GPT4ALL, GPTJ header)
     GGHF=2, // 2=(llama ggmf)
     GGJT=3, // 3=(llama ggjt)
-    GGJT_2=4, //newer llama format
+    GGJT_2=4, //newer llama format unshuffled
 
     GPTJ_1=100, //the very first super old GPTJ format
     GPTJ_2=101, //pygmalion, uses old ggml lib
     GPTJ_3=102, //uses new ggml lib
+    GPTJ_4=103, //unshuffled
 
     GPT2_1=200,
     GPT2_2=201,
+    GPT2_3=202, //unshuffled
 
     RWKV_1=300,
 
     NEOX_1=400,
     NEOX_2=401,
-    NEOX_3=402,
+    NEOX_3=402, //redpajama
+    NEOX_4=403, //unshuffled
+    NEOX_5=404, //unshuffled redpajama
 };
 
 enum ModelLoadResult
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index bb2310206..e30c4e0d4 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -50,6 +50,9 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index 641c9edde..90570c62a 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -51,6 +51,9 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
         fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp
index 36983eae4..acbd5cd6e 100644
--- a/otherarch/neox.cpp
+++ b/otherarch/neox.cpp
@@ -49,6 +49,9 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
         fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
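
For reference, a minimal standalone C++ sketch (not part of the patch) of the packed ftype convention the hunks above rely on. GGML_QNT_VERSION_FACTOR matches the constant added to ggml.h; the header value used here is made up purely for illustration.

#include <cstdint>
#include <cstdio>

#define GGML_QNT_VERSION_FACTOR 1000 // same constant this patch adds to ggml.h

int main() {
    // Newer ggml exports pack the quantization format version into the ftype header field:
    // stored = ftype + qntvr * GGML_QNT_VERSION_FACTOR. Legacy files have qntvr == 0, so
    // their small ftype values pass through the decode below unchanged.
    int32_t stored = 2002; // hypothetical header value: qntvr = 2, ftype = 2

    const int32_t qntvr = stored / GGML_QNT_VERSION_FACTOR; // quantization format version
    const int32_t ftype = stored % GGML_QNT_VERSION_FACTOR; // underlying tensor type id

    // model_adapter.cpp applies the same split when sniffing headers: qntvr == 0 keeps the
    // older shuffled formats (GPTJ_3, GPT2_2, NEOX_2), while a non-zero qntvr selects the
    // new unshuffled variants (GPTJ_4, GPT2_3, NEOX_4/NEOX_5), and the loaders then call
    // SetQuantsUnshuffled(...) accordingly.
    printf("qntvr = %d, ftype = %d (%s quants)\n", (int)qntvr, (int)ftype,
           qntvr == 0 ? "shuffled legacy" : "unshuffled");
    return 0;
}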