Concedo 2023-05-14 17:21:07 +08:00
parent e01e373e63
commit b692e4d2a4
8 changed files with 99 additions and 25 deletions

View file

@@ -63,7 +63,7 @@ extern "C"
     putenv((char*)deviceenv.c_str());
     executable_path = inputs.executable_path;
-    if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3)
+    if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
     {
         printf("\n---\nIdentified as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
         ModelLoadResult lr = gpttype_load_model(inputs, file_format);
@@ -73,6 +73,13 @@ extern "C"
         {
             //if we tried 1 first, then try 3 and lastly 2
             //otherwise if we tried 3 first, then try 2
+            file_format = FileFormat::GPTJ_4;
+            printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+            lr = gpttype_load_model(inputs, file_format);
+        }
+        if (lr == ModelLoadResult::RETRY_LOAD)
+        {
             file_format = FileFormat::GPTJ_3;
             printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
             lr = gpttype_load_model(inputs, file_format);
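The retry logic above walks candidate formats from newest to oldest until the loader stops reporting RETRY_LOAD. A minimal sketch of that cascade, with a simplified result enum and a caller-supplied loader standing in for gpttype_load_model:

    // Sketch only: LoadResult mirrors ModelLoadResult; try_load is a stand-in
    // for gpttype_load_model(inputs, file_format).
    #include <initializer_list>

    enum class LoadResult { SUCCESS, FAIL, RETRY_LOAD };

    template <typename Fmt, typename Loader>
    LoadResult load_with_fallback(std::initializer_list<Fmt> newest_first, Loader try_load) {
        LoadResult lr = LoadResult::FAIL;
        for (Fmt f : newest_first) {
            lr = try_load(f);
            if (lr != LoadResult::RETRY_LOAD) break; // SUCCESS or a hard FAIL ends the cascade
        }
        return lr;
    }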
@@ -96,11 +103,17 @@ extern "C"
             return true;
         }
     }
-    else if(file_format==FileFormat::GPT2_1||file_format==FileFormat::GPT2_2)
+    else if(file_format==FileFormat::GPT2_1||file_format==FileFormat::GPT2_2||file_format==FileFormat::GPT2_3)
     {
         printf("\n---\nIdentified as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
         ModelLoadResult lr = gpttype_load_model(inputs, file_format);
+        if (lr == ModelLoadResult::RETRY_LOAD)
+        {
+            file_format = FileFormat::GPT2_3;
+            printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+            lr = gpttype_load_model(inputs, file_format);
+        }
         if (lr == ModelLoadResult::RETRY_LOAD)
         {
             file_format = FileFormat::GPT2_2;
             printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
@@ -128,16 +141,25 @@ extern "C"
             return true;
         }
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
         printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
         ModelLoadResult lr = gpttype_load_model(inputs, file_format);
         if (lr == ModelLoadResult::RETRY_LOAD)
         {
+            if(file_format==FileFormat::NEOX_2)
+            {
                 file_format = FileFormat::NEOX_3;
                 printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
                 lr = gpttype_load_model(inputs, file_format);
+            }
+            else
+            {
+                file_format = FileFormat::NEOX_5;
+                printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+                lr = gpttype_load_model(inputs, file_format);
+            }
         }
         if (lr == ModelLoadResult::RETRY_LOAD)
         {
             file_format = FileFormat::NEOX_1;

ggml.h
View file

@@ -190,6 +190,8 @@
 #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
+#define GGML_QNT_VERSION 1 // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_NODES 4096
 #define GGML_MAX_PARAMS 256
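The two new constants implement a simple packing scheme: newer files store ftype + GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR, so old readers see an out-of-range ftype while new readers can split the field back apart. A sketch of the round trip (constants mirror the defines above; the helper names are illustrative, not part of ggml's API):

    #include <cstdint>

    constexpr int32_t kQntVersion = 1;    // GGML_QNT_VERSION
    constexpr int32_t kQntFactor  = 1000; // GGML_QNT_VERSION_FACTOR

    // e.g. ftype q4_0 (2) written under quantization version 1 is stored as 1002
    constexpr int32_t pack_ftype(int32_t ftype)    { return ftype + kQntVersion * kQntFactor; }
    constexpr int32_t stored_qntvr(int32_t stored) { return stored / kQntFactor; } // 1002 -> 1
    constexpr int32_t stored_ftype(int32_t stored) { return stored % kQntFactor; } // 1002 -> 2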

View file

@@ -329,8 +329,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
         return ModelLoadResult::SUCCESS;
     }
-    else if (file_format == FileFormat::GPT2_2)
+    else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
     {
+        //newer format has bit unshuffling
+        SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
         ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format);
         if(res==ModelLoadResult::FAIL)
         {
@@ -372,7 +375,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
         ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
         if(res==ModelLoadResult::FAIL)
@@ -385,14 +388,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
             return res;
         }
+        //newer format has bit unshuffling
+        SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
         // determine the required inference memory per token:
         stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
-        if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
+        if(logits.size()>0 && (file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_4) && !IsNanCheck(logits[0]))
         {
             //run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
             std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
-            stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, FileFormat::NEOX_3);
+            stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, (file_format==FileFormat::NEOX_2?FileFormat::NEOX_3:FileFormat::NEOX_5));
             int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
             std::string predicted = vocab.id_to_token[topid].c_str();
             if(predicted.find("8") != std::string::npos)
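The RedPajama check is a behavioral probe: tokenize a counting prompt, re-evaluate it under the alternative NeoX variant, and accept that variant if the most likely next token contains "8". The same idea in isolation, where predict_next is a hypothetical stand-in for gpt_tokenize plus stablelm_eval plus an argmax over the returned logits:

    #include <functional>
    #include <string>

    // Returns true if the model, evaluated under the candidate format, continues
    // "...6 7" with a token containing "8"; a wrong format tends to yield noise here.
    bool continues_the_count(const std::function<std::string(const std::string &)> &predict_next) {
        return predict_next("1 2 3 4 5 6 7").find("8") != std::string::npos;
    }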
@@ -407,6 +414,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
+        //newer format has bit unshuffling
+        SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
         ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab);
         if (loadresult == ModelLoadResult::FAIL)
         {
@@ -584,7 +594,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gptj_ctx_v1.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::GPTJ_3)
+    else if(file_format == FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
     {
         n_vocab = gptj_ctx_v2.hparams.n_vocab;
     }
@@ -592,11 +602,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gpt2_ctx_v1.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::GPT2_2)
+    else if(file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
     {
         n_vocab = gpt2_ctx_v2.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
+    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
         n_vocab = neox_ctx.hparams.n_vocab;
     }
@@ -678,11 +688,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         evalres = legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
     }
-    else if(file_format==FileFormat::GPT2_2)
+    else if(file_format==FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
     {
         evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
+    else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
         evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
     }
@@ -750,9 +760,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     // set the logit of the eos token (50256) to zero to avoid sampling it
     if ((file_format == FileFormat::GPT2_1 ||
          file_format == FileFormat::GPT2_2 ||
+         file_format == FileFormat::GPT2_3 ||
          file_format == FileFormat::GPTJ_1 ||
          file_format == FileFormat::GPTJ_2 ||
-         file_format == FileFormat::GPTJ_3) &&
+         file_format == FileFormat::GPTJ_3 ||
+         file_format == FileFormat::GPTJ_4) &&
         logits.size() > 50256)
     {
         logits[50256] = (logits[50256] < 0 ? logits[50256] : 0);
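Note that the clamp does not hard-ban end-of-text: it only caps the logit of token 50256 (<|endoftext|> in the GPT-2/GPT-J vocabulary) at zero, making it unlikely rather than impossible to sample. The same operation as a small helper, for illustration:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Cap the EOS logit at zero, matching the clamp in the hunk above.
    void damp_eos(std::vector<float> &logits, std::size_t eos_id = 50256) {
        if (logits.size() > eos_id) {
            logits[eos_id] = std::min(logits[eos_id], 0.0f);
        }
    }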

View file

@@ -108,7 +108,13 @@ void print_tok_vec(std::vector<float> &embd)
         fin.read((char *)&temp, sizeof(temp)); //n_layer
         fin.read((char *)&temp, sizeof(temp)); //n_rot
         fin.read((char *)&temp, sizeof(temp)); //f16
-        if(temp!=0 && temp!=1)
+        const int32_t qntvr = temp / 1000;
+        temp %= 1000;
+        if (qntvr != 0)
+        {
+            fileformat = FileFormat::GPTJ_4;
+        }
+        else if (temp != 0 && temp != 1)
         {
             fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
         }
@@ -122,7 +128,13 @@ void print_tok_vec(std::vector<float> &embd)
         fin.read((char *)&temp, sizeof(temp)); //n_head
         fin.read((char *)&temp, sizeof(temp)); //n_layer
         fin.read((char *)&temp, sizeof(temp)); //f16
-        if(temp!=0 && temp!=1)
+        const int32_t qntvr = temp / 1000;
+        temp %= 1000;
+        if (qntvr != 0)
+        {
+            fileformat = FileFormat::GPT2_3;
+        }
+        else if (temp != 0 && temp != 1)
         {
             fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
         }
@@ -130,9 +142,22 @@ void print_tok_vec(std::vector<float> &embd)
     else if(vocabsiz < 31998 || vocabsiz > 33000)
     {
         //anything outside the llama v1 range is assumed to be NeoX
+        fileformat = FileFormat::NEOX_4;
+        uint32_t temp;
+        fin.read((char *)&temp, sizeof(temp)); //ctx
+        fin.read((char *)&temp, sizeof(temp)); //n_embd
+        fin.read((char *)&temp, sizeof(temp)); //n_head
+        fin.read((char *)&temp, sizeof(temp)); //n_layer
+        fin.read((char *)&temp, sizeof(temp)); //n_rot
+        fin.read((char *)&temp, sizeof(temp)); //f16
+        const int32_t qntvr = temp / 1000;
+        temp %= 1000;
+        if(qntvr==0)
+        {
             fileformat = FileFormat::NEOX_2;
+        }
     }
 }
 else if(magic == 0x67676d66) //v2 format ggmf
 {
     fileformat = FileFormat::GGHF;
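Format detection starts from the four-byte magic: 0x67676d6c ("ggml") marks the legacy unversioned container, whose header fields and qntvr split are then probed as above, while 0x67676d66 ("ggmf") marks the versioned v2 container. A trimmed sketch of that first dispatch step, where Container is a simplified stand-in for FileFormat:

    #include <cstdint>
    #include <fstream>
    #include <string>

    enum class Container { LEGACY_GGML, GGMF, UNKNOWN };

    Container sniff_container(const std::string &path) {
        std::ifstream fin(path, std::ios::binary);
        uint32_t magic = 0;
        fin.read(reinterpret_cast<char *>(&magic), sizeof(magic));
        if (magic == 0x67676d6c) return Container::LEGACY_GGML; // "ggml": probe the header further
        if (magic == 0x67676d66) return Container::GGMF;        // "ggmf": versioned v2 format
        return Container::UNKNOWN;
    }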

View file

@@ -19,20 +19,24 @@ enum FileFormat
     GGML=1, // 1=(original llama ggml, alpaca, GPT4ALL, GPTJ header)
     GGHF=2, // 2=(llama ggmf)
     GGJT=3, // 3=(llama ggjt)
-    GGJT_2=4, //newer llama format
+    GGJT_2=4, //newer llama format unshuffled
     GPTJ_1=100, //the very first super old GPTJ format
     GPTJ_2=101, //pygmalion, uses old ggml lib
     GPTJ_3=102, //uses new ggml lib
+    GPTJ_4=103, //unshuffled
     GPT2_1=200,
     GPT2_2=201,
+    GPT2_3=202, //unshuffled
     RWKV_1=300,
     NEOX_1=400,
     NEOX_2=401,
-    NEOX_3=402,
+    NEOX_3=402, //redpajama
+    NEOX_4=403, //unshuffled
+    NEOX_5=404, //unshuffled redpajama
 };
enum ModelLoadResult
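Read together, the new values give every model family an "unshuffled" variant for the new quantization layout. A hypothetical predicate (not present in the codebase) that collects the conditions the loaders pass to SetQuantsUnshuffled, assuming the enum above:

    // Mirrors the SetQuantsUnshuffled(...) call sites in gpttype_load_model.
    inline bool is_unshuffled_format(FileFormat f) {
        return f == FileFormat::GGJT_2 || f == FileFormat::GPTJ_4 ||
               f == FileFormat::GPT2_3 || f == FileFormat::NEOX_4 ||
               f == FileFormat::NEOX_5;
    }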

View file

@@ -50,6 +50,9 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);

View file

@@ -51,6 +51,9 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
         fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);

View file

@@ -49,6 +49,9 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
         fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);