wip

parent e01e373e63
commit b692e4d2a4

8 changed files with 99 additions and 25 deletions
expose.cpp | 28

@@ -63,7 +63,7 @@ extern "C"
     putenv((char*)deviceenv.c_str());
     executable_path = inputs.executable_path;

-    if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3)
+    if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
     {
         printf("\n---\nIdentified as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
         ModelLoadResult lr = gpttype_load_model(inputs, file_format);

@@ -73,6 +73,13 @@ extern "C"
         {
             //if we tried 1 first, then try 3 and lastly 2
             //otherwise if we tried 3 first, then try 2
+            file_format = FileFormat::GPTJ_4;
+            printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+            lr = gpttype_load_model(inputs, file_format);
+        }
+
+        if (lr == ModelLoadResult::RETRY_LOAD)
+        {
             file_format = FileFormat::GPTJ_3;
             printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
             lr = gpttype_load_model(inputs, file_format);

@@ -96,11 +103,17 @@ extern "C"
             return true;
         }
     }
-    else if(file_format==FileFormat::GPT2_1||file_format==FileFormat::GPT2_2)
+    else if(file_format==FileFormat::GPT2_1||file_format==FileFormat::GPT2_2||file_format==FileFormat::GPT2_3)
     {
         printf("\n---\nIdentified as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
         ModelLoadResult lr = gpttype_load_model(inputs, file_format);
         if (lr == ModelLoadResult::RETRY_LOAD)
         {
+            file_format = FileFormat::GPT2_3;
+            printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+            lr = gpttype_load_model(inputs, file_format);
+        }
+        if (lr == ModelLoadResult::RETRY_LOAD)
+        {
             file_format = FileFormat::GPT2_2;
             printf("\n---\nRetrying as GPT-2 model: (ver %d)\nAttempting to Load...\n---\n", file_format);

@@ -128,16 +141,25 @@ extern "C"
             return true;
         }
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
         printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
         ModelLoadResult lr = gpttype_load_model(inputs, file_format);
         if (lr == ModelLoadResult::RETRY_LOAD)
         {
+            if(file_format==FileFormat::NEOX_2)
+            {
                 file_format = FileFormat::NEOX_3;
                 printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
                 lr = gpttype_load_model(inputs, file_format);
+            }
+            else
+            {
+                file_format = FileFormat::NEOX_5;
+                printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+                lr = gpttype_load_model(inputs, file_format);
+            }
         }
         if (lr == ModelLoadResult::RETRY_LOAD)
         {
             file_format = FileFormat::NEOX_1;

ggml.h | 2

@@ -190,6 +190,8 @@
 #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1

+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS 4
 #define GGML_MAX_NODES 4096
 #define GGML_MAX_PARAMS 256

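Note: GGML_QNT_VERSION_FACTOR is the constant the loader and format-detection changes further down use to unpack a quantization version that is now folded into the stored ftype/f16 header field (qntvr = value / 1000, then value %= 1000). A minimal standalone sketch of that decode follows; the decode_ftype helper and the sample value 1002 are illustrative only and are not part of this commit.

    #include <cstdint>
    #include <cstdio>

    // Mirrors the decode pattern added in this commit:
    // the stored field packs (quantization version * 1000) + base ftype.
    static const int32_t QNT_VERSION_FACTOR = 1000; // same value as GGML_QNT_VERSION_FACTOR

    static void decode_ftype(int32_t stored, int32_t &qntvr, int32_t &base_ftype)
    {
        qntvr = stored / QNT_VERSION_FACTOR;      // 0 means an old, pre-versioned file
        base_ftype = stored % QNT_VERSION_FACTOR; // the original ftype value
    }

    int main()
    {
        int32_t qntvr = 0, ftype = 0;
        decode_ftype(1002, qntvr, ftype);                 // hypothetical header value
        printf("qntvr = %d, ftype = %d\n", qntvr, ftype); // prints: qntvr = 1, ftype = 2
        return 0;
    }
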
@@ -329,8 +329,11 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
         return ModelLoadResult::SUCCESS;
     }
-    else if (file_format == FileFormat::GPT2_2)
+    else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
     {
+        //newer format has bit unshuffling
+        SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
+
         ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format);
         if(res==ModelLoadResult::FAIL)
         {

@@ -372,7 +375,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in

         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
         ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
         if(res==ModelLoadResult::FAIL)

@@ -385,14 +388,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
             return res;
         }
+
+        //newer format has bit unshuffling
+        SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
+
         // determine the required inference memory per token:
         stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);

-        if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
+        if(logits.size()>0 && (file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_4) && !IsNanCheck(logits[0]))
         {
             //run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
             std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
-            stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, FileFormat::NEOX_3);
+            stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, (file_format==FileFormat::NEOX_2?FileFormat::NEOX_3:FileFormat::NEOX_5));
             int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
             std::string predicted = vocab.id_to_token[topid].c_str();
             if(predicted.find("8") != std::string::npos)

@@ -407,6 +414,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else
     {
+        //newer format has bit unshuffling
+        SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
+
         ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab);
         if (loadresult == ModelLoadResult::FAIL)
         {

@@ -584,7 +594,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gptj_ctx_v1.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::GPTJ_3)
+    else if(file_format == FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
     {
         n_vocab = gptj_ctx_v2.hparams.n_vocab;
     }

@@ -592,11 +602,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gpt2_ctx_v1.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::GPT2_2)
+    else if(file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
     {
         n_vocab = gpt2_ctx_v2.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
+    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
         n_vocab = neox_ctx.hparams.n_vocab;
     }

@@ -678,11 +688,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         evalres = legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
     }
-    else if(file_format==FileFormat::GPT2_2)
+    else if(file_format==FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
     {
         evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
+    else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
         evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
     }

@@ -750,9 +760,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             // set the logit of the eos token (2) to zero to avoid sampling it
             if ((file_format == FileFormat::GPT2_1 ||
                  file_format == FileFormat::GPT2_2 ||
+                 file_format == FileFormat::GPT2_3 ||
                  file_format == FileFormat::GPTJ_1 ||
                  file_format == FileFormat::GPTJ_2 ||
-                 file_format == FileFormat::GPTJ_3) &&
+                 file_format == FileFormat::GPTJ_3 ||
+                 file_format == FileFormat::GPTJ_4) &&
                 logits.size() > 50256)
             {
                 logits[50256] = (logits[50256] < 0 ? logits[50256] : 0);

@@ -108,7 +108,13 @@ void print_tok_vec(std::vector<float> &embd)
             fin.read((char *)&temp, sizeof(temp)); //n_layer
             fin.read((char *)&temp, sizeof(temp)); //n_rot
             fin.read((char *)&temp, sizeof(temp)); //f16
-            if(temp!=0 && temp!=1)
+            const int32_t qntvr = temp / 1000;
+            temp %= 1000;
+            if (qntvr != 0)
             {
+                fileformat = FileFormat::GPTJ_4;
+            }
+            else if (temp != 0 && temp != 1)
+            {
                 fileformat = FileFormat::GPTJ_3; //quantized format cannot be legacy type
             }

@@ -122,7 +128,13 @@ void print_tok_vec(std::vector<float> &embd)
             fin.read((char *)&temp, sizeof(temp)); //n_head
             fin.read((char *)&temp, sizeof(temp)); //n_layer
             fin.read((char *)&temp, sizeof(temp)); //f16
-            if(temp!=0 && temp!=1)
+            const int32_t qntvr = temp / 1000;
+            temp %= 1000;
+            if (qntvr != 0)
             {
+                fileformat = FileFormat::GPT2_3;
+            }
+            else if (temp != 0 && temp != 1)
+            {
                 fileformat = FileFormat::GPT2_2; //quantized format cannot be legacy type
             }

@@ -130,9 +142,22 @@ void print_tok_vec(std::vector<float> &embd)
         else if(vocabsiz < 31998 || vocabsiz > 33000)
         {
             //anything outside the llama v1 range is assumed to be NeoX
+            fileformat = FileFormat::NEOX_4;
+            uint32_t temp;
+            fin.read((char *)&temp, sizeof(temp)); //ctx
+            fin.read((char *)&temp, sizeof(temp)); //n_embd
+            fin.read((char *)&temp, sizeof(temp)); //n_head
+            fin.read((char *)&temp, sizeof(temp)); //n_layer
+            fin.read((char *)&temp, sizeof(temp)); //n_rot
+            fin.read((char *)&temp, sizeof(temp)); //f16
+            const int32_t qntvr = temp / 1000;
+            temp %= 1000;
+            if(qntvr==0)
+            {
                 fileformat = FileFormat::NEOX_2;
+            }
         }
     }
     else if(magic == 0x67676d66) //v2 format ggmf
     {
         fileformat = FileFormat::GGHF;

@@ -19,20 +19,24 @@ enum FileFormat
     GGML=1, // 1=(original llama ggml, alpaca, GPT4ALL, GPTJ header)
     GGHF=2, // 2=(llama ggmf)
     GGJT=3, // 3=(llama ggjt)
-    GGJT_2=4, //newer llama format
+    GGJT_2=4, //newer llama format unshuffled

     GPTJ_1=100, //the very first super old GPTJ format
     GPTJ_2=101, //pygmalion, uses old ggml lib
     GPTJ_3=102, //uses new ggml lib
+    GPTJ_4=103, //unshuffled

     GPT2_1=200,
     GPT2_2=201,
+    GPT2_3=202, //unshuffled

     RWKV_1=300,

     NEOX_1=400,
     NEOX_2=401,
-    NEOX_3=402,
+    NEOX_3=402, //redpajama
+    NEOX_4=403, //unshuffled
+    NEOX_5=404, //unshuffled redpajama
 };

 enum ModelLoadResult

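Note: the new enum values correspond to the SetQuantsUnshuffled() calls added in the loader code earlier in this diff. The sketch below collects those per-format comparisons into one predicate for illustration only; the commit itself passes each comparison directly to SetQuantsUnshuffled(), and the real FileFormat is a plain enum rather than the minimal copy used here.

    #include <cstdio>

    // Minimal copy of the relevant values from the enum above, for illustration only.
    enum class FileFormat { GGJT_2 = 4, GPTJ_4 = 103, GPT2_3 = 202, NEOX_4 = 403, NEOX_5 = 404 };

    // Hypothetical helper: true for the formats this commit marks as "unshuffled"
    // (GGJT_2 is included on the strength of its enum comment only).
    static bool is_unshuffled_format(FileFormat f)
    {
        return f == FileFormat::GGJT_2 || f == FileFormat::GPTJ_4 ||
               f == FileFormat::GPT2_3 || f == FileFormat::NEOX_4 ||
               f == FileFormat::NEOX_5;
    }

    int main()
    {
        printf("%d\n", is_unshuffled_format(FileFormat::GPT2_3) ? 1 : 0); // prints 1
        return 0;
    }
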
@@ -50,6 +50,9 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);

@@ -51,6 +51,9 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
         fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);

@@ -49,6 +49,9 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
         fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
         fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));

+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd = %d\n", __func__, hparams.n_embd);