Concedo 2023-05-14 17:21:07 +08:00
parent e01e373e63
commit b692e4d2a4
8 changed files with 99 additions and 25 deletions


@@ -63,7 +63,7 @@ extern "C"
     putenv((char*)deviceenv.c_str());
     executable_path = inputs.executable_path;
-    if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3)
+    if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
     {
         printf("\n---\nIdentified as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
         ModelLoadResult lr = gpttype_load_model(inputs, file_format);
@@ -73,11 +73,18 @@ extern "C"
         {
            //if we tried 1 first, then try 3 and lastly 2
            //otherwise if we tried 3 first, then try 2
-           file_format = FileFormat::GPTJ_3;
+           file_format = FileFormat::GPTJ_4;
            printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
            lr = gpttype_load_model(inputs, file_format);
        }
+       if (lr == ModelLoadResult::RETRY_LOAD)
+       {
+           file_format = FileFormat::GPTJ_3;
+           printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+           lr = gpttype_load_model(inputs, file_format);
+       }
        //lastly try format 2
        if (lr == ModelLoadResult::RETRY_LOAD)
        {
@@ -96,11 +103,17 @@ extern "C"
            return true;
        }
    }
-   else if(file_format==FileFormat::GPT2_1||file_format==FileFormat::GPT2_2)
+   else if(file_format==FileFormat::GPT2_1||file_format==FileFormat::GPT2_2||file_format==FileFormat::GPT2_3)
    {
        printf("\n---\nIdentified as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
        ModelLoadResult lr = gpttype_load_model(inputs, file_format);
        if (lr == ModelLoadResult::RETRY_LOAD)
+       {
+           file_format = FileFormat::GPT2_3;
+           printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+           lr = gpttype_load_model(inputs, file_format);
+       }
+       if (lr == ModelLoadResult::RETRY_LOAD)
        {
            file_format = FileFormat::GPT2_2;
            printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
@@ -128,15 +141,24 @@ extern "C"
            return true;
        }
    }
-   else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
+   else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
    {
        printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
        ModelLoadResult lr = gpttype_load_model(inputs, file_format);
        if (lr == ModelLoadResult::RETRY_LOAD)
        {
-           file_format = FileFormat::NEOX_3;
-           printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
-           lr = gpttype_load_model(inputs, file_format);
+           if(file_format==FileFormat::NEOX_2)
+           {
+               file_format = FileFormat::NEOX_3;
+               printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+               lr = gpttype_load_model(inputs, file_format);
+           }
+           else
+           {
+               file_format = FileFormat::NEOX_5;
+               printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+               lr = gpttype_load_model(inputs, file_format);
+           }
        }
        if (lr == ModelLoadResult::RETRY_LOAD)
        {
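Note: every architecture above uses the same fallback pattern — attempt the newest format first, then step down one version on each RETRY_LOAD. A minimal sketch of that cascade, assuming the repo's existing FileFormat, ModelLoadResult, load_model_inputs and gpttype_load_model declarations (the try_formats helper itself is hypothetical, not part of this commit):

```cpp
#include <cstdio>
#include <initializer_list>

// Hypothetical helper illustrating the retry cascade: walk the candidate
// formats newest-first and stop at the first result that is not RETRY_LOAD.
static ModelLoadResult try_formats(const load_model_inputs &inputs,
                                   FileFormat &file_format,
                                   std::initializer_list<FileFormat> candidates)
{
    ModelLoadResult lr = ModelLoadResult::RETRY_LOAD;
    for (FileFormat candidate : candidates)
    {
        file_format = candidate;
        printf("\n---\nRetrying as model: (ver %d)\nAttempting to Load...\n---\n", file_format);
        lr = gpttype_load_model(inputs, file_format);
        if (lr != ModelLoadResult::RETRY_LOAD)
        {
            break; // SUCCESS or a hard FAIL ends the cascade
        }
    }
    return lr;
}
```

For GPT-J the candidate order would be {GPTJ_4, GPTJ_3, GPTJ_2}, matching the chain in the hunks above.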

ggml.h

@@ -190,6 +190,8 @@
 #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_NODES 4096
 #define GGML_MAX_PARAMS 256
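GGML_QNT_VERSION_FACTOR packs a quantization version into the existing ftype header field as ftype_on_disk = qntvr * 1000 + ftype. A self-contained sketch of the arithmetic the loaders below rely on:

```cpp
#include <cstdint>
#include <cstdio>

#define GGML_QNT_VERSION_FACTOR 1000 // do not change this

int main()
{
    // Pack: quantization version 1 with a base ftype of 2 (values illustrative).
    int32_t ftype = 1 * GGML_QNT_VERSION_FACTOR + 2; // 1002 on disk

    // Unpack, exactly as the model loaders in this commit do.
    const int32_t qntvr = ftype / GGML_QNT_VERSION_FACTOR; // 1
    ftype %= GGML_QNT_VERSION_FACTOR;                      // 2

    printf("qntvr = %d, ftype = %d\n", qntvr, ftype);
    return 0;
}
```

Legacy files wrote small ftype values (0, 1, 2, ...), so a qntvr of zero identifies the old encoding — which is what the detection code below keys on.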


@@ -329,8 +329,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
        legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
        return ModelLoadResult::SUCCESS;
    }
-   else if (file_format == FileFormat::GPT2_2)
+   else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
    {
+       //newer format has bit unshuffling
+       SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
+
        ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format);
        if(res==ModelLoadResult::FAIL)
        {
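SetQuantsUnshuffled appears to toggle a process-wide flag telling the shared dequantization routines whether the weights use the new unshuffled bit layout. A hypothetical sketch of such a toggle (the actual implementation is not shown in this diff):

```cpp
// Hypothetical sketch of a global layout toggle; the real
// SetQuantsUnshuffled in this repo may differ.
static bool quants_unshuffled = false;

void SetQuantsUnshuffled(bool unshuffled)
{
    quants_unshuffled = unshuffled;
}
```

A global flag keeps the format out of every dequantize call signature, at the cost of ruling out two concurrently loaded models with different layouts.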
@@ -372,7 +375,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
        return ModelLoadResult::SUCCESS;
    }
-   else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
+   else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
    {
        ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
        if(res==ModelLoadResult::FAIL)
@@ -385,14 +388,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
            printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
            return res;
        }
+
+       //newer format has bit unshuffling
+       SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
+
        // determine the required inference memory per token:
        stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
-       if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
+       if(logits.size()>0 && (file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_4) && !IsNanCheck(logits[0]))
        {
            //run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
            std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
-           stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, FileFormat::NEOX_3);
+           stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, (file_format==FileFormat::NEOX_2?FileFormat::NEOX_3:FileFormat::NEOX_5));
            int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
            std::string predicted = vocab.id_to_token[topid].c_str();
            if(predicted.find("8") != std::string::npos)
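The redpajama probe feeds the freshly loaded model the literal prompt "1 2 3 4 5 6 7" under the alternate format's eval path, then takes the argmax over the logits; if the top token contains "8", the alternate interpretation is evidently the correct one. The argmax idiom in isolation, as a self-contained toy:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    // Toy logits: index 3 scores highest, so it is the "predicted" token id.
    std::vector<float> logits = {0.1f, -2.0f, 0.5f, 4.2f, 1.0f};
    int topid = std::max_element(logits.begin(), logits.end()) - logits.begin();
    printf("top token id = %d\n", topid); // prints 3
    return 0;
}
```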
@@ -407,6 +414,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
    }
    else
    {
+       //newer format has bit unshuffling
+       SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
+
        ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab);
        if (loadresult == ModelLoadResult::FAIL)
        {
@@ -584,7 +594,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
    {
        n_vocab = gptj_ctx_v1.hparams.n_vocab;
    }
-   else if(file_format == FileFormat::GPTJ_3)
+   else if(file_format == FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
    {
        n_vocab = gptj_ctx_v2.hparams.n_vocab;
    }
@@ -592,11 +602,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
    {
        n_vocab = gpt2_ctx_v1.hparams.n_vocab;
    }
-   else if(file_format == FileFormat::GPT2_2)
+   else if(file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
    {
        n_vocab = gpt2_ctx_v2.hparams.n_vocab;
    }
-   else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
+   else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
    {
        n_vocab = neox_ctx.hparams.n_vocab;
    }
@@ -678,11 +688,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
        {
            evalres = legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
        }
-       else if(file_format==FileFormat::GPT2_2)
+       else if(file_format==FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
        {
            evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
        }
-       else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
+       else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
        {
            evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
        }
@@ -750,9 +760,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
            // set the logit of the eos token (2) to zero to avoid sampling it
            if ((file_format == FileFormat::GPT2_1 ||
                 file_format == FileFormat::GPT2_2 ||
+                file_format == FileFormat::GPT2_3 ||
                 file_format == FileFormat::GPTJ_1 ||
                 file_format == FileFormat::GPTJ_2 ||
-                file_format == FileFormat::GPTJ_3) &&
+                file_format == FileFormat::GPTJ_3 ||
+                file_format == FileFormat::GPTJ_4) &&
                logits.size() > 50256)
            {
                logits[50256] = (logits[50256] < 0 ? logits[50256] : 0);
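Both the GPT-2 and GPT-J families use token id 50256 for end-of-text, and the clamp above caps its logit at zero rather than erasing it: an already-negative logit is left untouched since it is losing anyway. A toy illustration of the clamp:

```cpp
#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> logits(50257, 0.0f);
    logits[50256] = 7.5f; // without the clamp, EOS would dominate sampling

    // Same expression as in the diff: never let the EOS logit exceed zero.
    logits[50256] = (logits[50256] < 0 ? logits[50256] : 0);

    printf("eos logit after clamp = %f\n", logits[50256]); // 0.000000
    return 0;
}
```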


@@ -108,9 +108,15 @@ void print_tok_vec(std::vector<float> &embd)
            fin.read((char *)&temp, sizeof(temp)); //n_layer
            fin.read((char *)&temp, sizeof(temp)); //n_rot
            fin.read((char *)&temp, sizeof(temp)); //f16
-           if(temp!=0 && temp!=1)
+           const int32_t qntvr = temp / 1000;
+           temp %= 1000;
+           if (qntvr != 0)
            {
-               fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
+               fileformat = FileFormat::GPTJ_4;
+           }
+           else if (temp != 0 && temp != 1)
+           {
+               fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
            }
        }
        else if(vocabsiz==50257)
@@ -122,15 +128,34 @@ void print_tok_vec(std::vector<float> &embd)
            fin.read((char *)&temp, sizeof(temp)); //n_head
            fin.read((char *)&temp, sizeof(temp)); //n_layer
            fin.read((char *)&temp, sizeof(temp)); //f16
-           if(temp!=0 && temp!=1)
+           const int32_t qntvr = temp / 1000;
+           temp %= 1000;
+           if (qntvr != 0)
            {
-               fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
+               fileformat = FileFormat::GPT2_3;
+           }
+           else if (temp != 0 && temp != 1)
+           {
+               fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
            }
        }
        else if(vocabsiz < 31998 || vocabsiz > 33000)
        {
            //anything outside the llama v1 range is assumed to be NeoX
-           fileformat = FileFormat::NEOX_2;
+           fileformat = FileFormat::NEOX_4;
+           uint32_t temp;
+           fin.read((char *)&temp, sizeof(temp)); //ctx
+           fin.read((char *)&temp, sizeof(temp)); //n_embd
+           fin.read((char *)&temp, sizeof(temp)); //n_head
+           fin.read((char *)&temp, sizeof(temp)); //n_layer
+           fin.read((char *)&temp, sizeof(temp)); //n_rot
+           fin.read((char *)&temp, sizeof(temp)); //f16
+           const int32_t qntvr = temp / 1000;
+           temp %= 1000;
+           if(qntvr==0)
+           {
+               fileformat = FileFormat::NEOX_2;
+           }
        }
    }
    else if(magic == 0x67676d66) //v2 format ggmf
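Detection now assumes the newer NeoX variant up front, then reads the six fixed header fields in file order and downgrades to NEOX_2 when the packed quantization version is zero. A minimal sketch of that sniff against an open stream (hypothetical helper; assumes the stream is already positioned past the magic and vocab size, as above):

```cpp
#include <cstdint>
#include <fstream>

// Hypothetical helper mirroring the NeoX sniff: skip five header fields,
// then decode the packed quantization version from the f16/ftype field.
static bool neox_header_is_new_format(std::ifstream &fin)
{
    uint32_t temp = 0;
    fin.read((char *)&temp, sizeof(temp)); // ctx
    fin.read((char *)&temp, sizeof(temp)); // n_embd
    fin.read((char *)&temp, sizeof(temp)); // n_head
    fin.read((char *)&temp, sizeof(temp)); // n_layer
    fin.read((char *)&temp, sizeof(temp)); // n_rot
    fin.read((char *)&temp, sizeof(temp)); // f16 (packed as qntvr*1000 + ftype)
    const int32_t qntvr = temp / 1000;
    return qntvr != 0; // zero means the legacy NEOX_2 encoding
}
```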


@@ -19,20 +19,24 @@ enum FileFormat
     GGML=1,   // 1=(original llama ggml, alpaca, GPT4ALL, GPTJ header)
     GGHF=2,   // 2=(llama ggmf)
     GGJT=3,   // 3=(llama ggjt)
-    GGJT_2=4, //newer llama format
+    GGJT_2=4, //newer llama format unshuffled
 
     GPTJ_1=100, //the very first super old GPTJ format
     GPTJ_2=101, //pygmalion, uses old ggml lib
     GPTJ_3=102, //uses new ggml lib
+    GPTJ_4=103, //unshuffled
 
     GPT2_1=200,
     GPT2_2=201,
+    GPT2_3=202, //unshuffled
 
     RWKV_1=300,
 
     NEOX_1=400,
     NEOX_2=401,
-    NEOX_3=402,
+    NEOX_3=402, //redpajama
+    NEOX_4=403, //unshuffled
+    NEOX_5=404, //unshuffled redpajama
 };
 
 enum ModelLoadResult
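Several call sites now repeat the same membership test over the unshuffled variants; a hypothetical predicate that would centralize it (not part of this commit):

```cpp
// Hypothetical convenience predicate over the FileFormat enum above;
// not in this commit.
static bool IsUnshuffledFormat(FileFormat f)
{
    return f == FileFormat::GGJT_2 || f == FileFormat::GPTJ_4 ||
           f == FileFormat::GPT2_3 || f == FileFormat::NEOX_4 ||
           f == FileFormat::NEOX_5;
}
```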


@@ -50,6 +50,9 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
+       const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+       hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd = %d\n", __func__, hparams.n_embd);


@@ -51,6 +51,9 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
        fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
+       const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+       hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd = %d\n", __func__, hparams.n_embd);


@@ -49,6 +49,9 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
        fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
 
+       const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+       hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd = %d\n", __func__, hparams.n_embd);