the dark gods have been sated, and redpajama is integrated... but at what cost?
commit 2f2eff6e13 (parent b9904c3093)
4 changed files with 57 additions and 23 deletions
@@ -128,11 +128,17 @@ extern "C"
             return true;
         }
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
     {
         printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
         ModelLoadResult lr = gpttype_load_model(inputs, file_format);
         if (lr == ModelLoadResult::RETRY_LOAD)
+        {
+            file_format = FileFormat::NEOX_3;
+            printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+            lr = gpttype_load_model(inputs, file_format);
+        }
+        if (lr == ModelLoadResult::RETRY_LOAD)
         {
             file_format = FileFormat::NEOX_1;
             printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
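Note: the NeoX loader now recognizes NEOX_3 and retries a failed load twice, first as NEOX_3 (the RedPajama layout) and then as NEOX_1 (the oldest layout). A minimal sketch of the fallback chain, assuming the declarations from the adapter header are in scope; the try_neox_formats helper is hypothetical and stands in for the inline code above:

    // Sketch only; the commit writes this chain out inline rather than as a helper.
    static ModelLoadResult try_neox_formats(const load_model_inputs &inputs, FileFormat &file_format)
    {
        ModelLoadResult lr = gpttype_load_model(inputs, file_format);
        for (FileFormat fallback : { FileFormat::NEOX_3, FileFormat::NEOX_1 })
        {
            if (lr != ModelLoadResult::RETRY_LOAD)
            {
                break;
            }
            file_format = fallback; // callers read the final format back out of this reference
            printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
            lr = gpttype_load_model(inputs, file_format);
        }
        return lr;
    }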
@@ -369,7 +369,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3)
     {
         ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
         if(res==ModelLoadResult::FAIL)
@@ -383,7 +383,23 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
             return res;
         }
         // determine the required inference memory per token:
-        stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+        stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
+
+        if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
+        {
+            //run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
+            std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
+            stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, FileFormat::NEOX_3);
+            int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
+            std::string predicted = vocab.id_to_token[topid].c_str();
+            if(predicted.find("8") != std::string::npos)
+            {
+                printf("\n---\nRedPajama NeoX Detected! Switching to new format! (use_parallel_residual=False)\n");
+                ggml_free(neox_ctx.ctx);
+                return ModelLoadResult::RETRY_LOAD;
+            }
+        }
+
         return ModelLoadResult::SUCCESS;
     }
     else
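The "black magic" probe works because NEOX_2 and NEOX_3 read exactly the same tensor file; only the eval graph differs (parallel vs. sequential residual). So when a fresh load comes up as NEOX_2 with finite logits, the loader tokenizes "1 2 3 4 5 6 7", runs one eval through the NEOX_3 graph, and checks whether the argmax token contains "8": with the wrong residual wiring the logits are effectively noise, so a correct continuation is strong evidence the weights are a RedPajama export. Restated as a standalone sketch, assuming the file-scope neox_ctx, vocab, logits, and mem_per_token that the surrounding function uses:

    // Returns true when the sequential-residual (NEOX_3) graph yields a sensible
    // continuation, i.e. the weights were trained with use_parallel_residual=False.
    static bool looks_like_redpajama(int n_threads)
    {
        std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
        stablelm_eval(neox_ctx, n_threads, 0, test_embd, logits, mem_per_token, FileFormat::NEOX_3);
        int topid = std::max_element(logits.begin(), logits.end()) - logits.begin(); // argmax token id
        std::string predicted = vocab.id_to_token[topid];
        return predicted.find("8") != std::string::npos; // matches "8", " 8", "8." etc.
    }

On a hit the loader frees the half-built context and returns RETRY_LOAD, which the extern "C" caller in the first hunk answers by reloading the file as NEOX_3.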
@@ -514,13 +530,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     }
 
     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
-    bool approved_format = (file_format == FileFormat::GGML ||
-                            file_format == FileFormat::GGHF ||
-                            file_format == FileFormat::GGJT ||
-                            file_format == FileFormat::GPT2_2 ||
-                            file_format == FileFormat::GPTJ_3 ||
-                            file_format == FileFormat::NEOX_1 ||
-                            file_format == FileFormat::NEOX_2);
+    bool approved_format = !(file_format == FileFormat::BADFORMAT ||
+                             file_format == FileFormat::GPT2_1 ||
+                             file_format == FileFormat::GPTJ_1 ||
+                             file_format == FileFormat::GPTJ_2 ||
+                             file_format == FileFormat::RWKV_1);
     bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas());
     // bool blasmode = false;
     int original_batch = params.n_batch;
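approved_format flips from an allowlist to a blocklist here: rather than enumerating every format allowed to use the BLAS single-thread/huge-batch path, the code now names only the formats that must not use it, so NEOX_3 (and any format added later) qualifies without touching this list again. The new expression, restated as a hypothetical helper:

    // Equivalent to the new inline boolean above (sketch only).
    static bool approved_for_blas(FileFormat f)
    {
        switch (f)
        {
            case FileFormat::BADFORMAT: // unrecognized file
            case FileFormat::GPT2_1:
            case FileFormat::GPTJ_1:
            case FileFormat::GPTJ_2:
            case FileFormat::RWKV_1:
                return false; // excluded from BLAS large-batch mode
            default:
                return true;
        }
    }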
@@ -579,7 +593,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gpt2_ctx_v2.hparams.n_vocab;
     }
-    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2)
+    else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
     {
         n_vocab = neox_ctx.hparams.n_vocab;
     }
@@ -614,14 +628,14 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         {
             for (auto id : embd_inp)
             {
-                printf("'%s', ",llama_token_to_str(llama_ctx_v1, id));
+                printf("'%s (%d)', ",llama_token_to_str(llama_ctx_v1, id),id);
             }
         }
         else
         {
             for (auto id : embd_inp)
             {
-                printf("'%s', ",vocab.id_to_token[id].c_str());
+                printf("'%s (%d)', ",vocab.id_to_token[id].c_str(),id);
             }
         }
         printf("\n");
@@ -665,9 +679,9 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             {
                 evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
             }
-            else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2)
+            else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3)
             {
-                evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token);
+                evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
             }
             else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
             {
@@ -31,6 +31,7 @@ enum FileFormat
 
     NEOX_1=400,
     NEOX_2=401,
+    NEOX_3=402,
 };
 
 enum ModelLoadResult
@@ -345,7 +345,8 @@ bool stablelm_eval(
         const int n_past,
         const std::vector<gpt_vocab::id> & embd_inp,
         std::vector<float> & embd_w,
-        size_t & mem_per_token) {
+        size_t & mem_per_token,
+        FileFormat file_format) {
     const int N = embd_inp.size();
 
     const auto & hparams = model.hparams;
@@ -494,6 +495,12 @@ bool stablelm_eval(
             }
         }
 
+        if(file_format==FileFormat::NEOX_3)
+        {
+            // layer input + Attn
+            cur = ggml_add(ctx0, cur, inpL);
+        }
+
         struct ggml_tensor * inpFF = cur;
 
         // feed-forward network
@@ -502,7 +509,7 @@ bool stablelm_eval(
         // post attention layer norm
        // note here we pass inpL instead of cur
         {
-            cur = ggml_norm(ctx0, inpL);
+            cur = ggml_norm(ctx0, (file_format==FileFormat::NEOX_3?cur:inpL));
 
             cur = ggml_add(ctx0,
                 ggml_mul(ctx0,
@@ -533,11 +540,17 @@ bool stablelm_eval(
                 cur);
         }
 
-        // layer input + FF
-        cur = ggml_add(ctx0, cur, inpFF);
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpL);
-
+        if (file_format == FileFormat::NEOX_3)
+        {
+            // layer input + FF
+            inpL = ggml_add(ctx0, cur, inpFF);
+        }
+        else
+        {
+            cur = ggml_add(ctx0, cur, inpFF);
+            // input for next layer
+            inpL = ggml_add(ctx0, cur, inpL);
+        }
     }
 
     // norm
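Taken together, the three stablelm_eval hunks rewire the transformer block for use_parallel_residual=False. In the stock NeoX graph both branches read the layer input and everything is summed at the end; in the RedPajama variant the attention output is folded into the stream first and the feed-forward branch operates on that intermediate state, which is also why the post-attention norm now takes cur instead of inpL. In pseudocode, with ln1/ln2, attn, and mlp abbreviating the full ggml subgraphs:

    // NEOX_1 / NEOX_2 (use_parallel_residual=True):
    //   out = x + attn(ln1(x)) + mlp(ln2(x));  // both branches see the layer input x
    //
    // NEOX_3 / RedPajama (use_parallel_residual=False):
    //   h   = x + attn(ln1(x));                // the new @@ -494 block: cur = ggml_add(ctx0, cur, inpL)
    //   out = h + mlp(ln2(h));                 // the @@ -502 norm change plus the @@ -533 add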