Added backwards compatibility with an earlier version of the NeoX format.
commit 5eec5d6ed9
parent bff998f871
6 changed files with 49 additions and 21 deletions
expose.cpp | 10
@@ -117,10 +117,16 @@ extern "C"
             return true;
         }
     }
-    else if(file_format==FileFormat::NEOX_1)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
     {
         printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
         ModelLoadResult lr = gpttype_load_model(inputs, file_format);
+        if (lr == ModelLoadResult::RETRY_LOAD)
+        {
+            file_format = FileFormat::NEOX_1;
+            printf("\n---\nRetrying as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
+            lr = gpttype_load_model(inputs, file_format);
+        }
         if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD)
         {
             return false;
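For context, the dispatch above now tries the current NEOX_2 layout first and falls back to NEOX_1 only when the loader signals RETRY_LOAD rather than a hard FAIL. Below is a minimal, self-contained sketch of that retry pattern; try_load is a hypothetical stand-in for gpttype_load_model, and both enums are re-declared in simplified form rather than copied from the project headers.

    #include <cstdio>

    // Simplified stand-ins for the project's FileFormat and ModelLoadResult enums.
    enum FileFormat { NEOX_1 = 400, NEOX_2 = 401 };
    enum class ModelLoadResult { FAIL, SUCCESS, RETRY_LOAD };

    // Stub loader in place of gpttype_load_model(): pretends the file only
    // parses under the legacy NEOX_1 layout, so the retry path is exercised.
    static ModelLoadResult try_load(FileFormat fmt)
    {
        return (fmt == NEOX_1) ? ModelLoadResult::SUCCESS : ModelLoadResult::RETRY_LOAD;
    }

    int main()
    {
        FileFormat file_format = NEOX_2;            // try the newest layout first
        ModelLoadResult lr = try_load(file_format);
        if (lr == ModelLoadResult::RETRY_LOAD)
        {
            file_format = NEOX_1;                   // fall back to the old layout
            printf("Retrying as GPT-NeoX (ver %d)\n", file_format);
            lr = try_load(file_format);
        }
        return (lr == ModelLoadResult::SUCCESS) ? 0 : 1;
    }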
@@ -218,13 +218,18 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::NEOX_1)
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2)
     {
-        bool res = stablelm_model_load(params.model, neox_ctx, vocab);
-        if(!res)
+        ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
+        if(res==ModelLoadResult::FAIL)
         {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return ModelLoadResult::FAIL;
+            return res;
         }
+        else if(res==ModelLoadResult::RETRY_LOAD)
+        {
+            printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
+            return res;
+        }
         // determine the required inference memory per token:
         stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
@@ -245,8 +250,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
 
     // determine the required inference memory per token:
-    gptj_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
     gptj_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
 
     //if the logits are NAN, it means the model is incompatible
     if(logits.size()>0 && IsNanCheck(logits[0]))
@@ -148,7 +148,7 @@ maxctx = 2048
 maxlen = 128
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.13.1"
+KcppVersion = "1.14"
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
@@ -130,7 +130,7 @@ void print_tok_vec(std::vector<float> &embd)
         else if(vocabsiz < 31998 || vocabsiz > 33000)
         {
             //anything outside the llama v1 range is assumed to be NeoX
-            fileformat = FileFormat::NEOX_1;
+            fileformat = FileFormat::NEOX_2;
         }
     }
     else if(magic == 0x67676d66) //v2 format ggmf
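Only the assumed NeoX version changes here; the heuristic itself stays the same. A rough standalone sketch of the rule, with classify_by_vocab as a hypothetical helper and the GGML fallback value an assumption (the real code distinguishes several llama-family formats):

    #include <cstdint>
    #include <cstdio>

    // Values mirror the FileFormat hunk further down; GGML here is an assumption.
    enum FileFormat { GGML = 1, NEOX_1 = 400, NEOX_2 = 401 };

    // Hypothetical condensation of the heuristic: a vocab size outside the
    // llama v1 range (31998..33000) is assumed to be GPT-NeoX, now tagged
    // NEOX_2 first so the loader can still retry as NEOX_1.
    static FileFormat classify_by_vocab(int32_t vocabsiz)
    {
        if (vocabsiz < 31998 || vocabsiz > 33000)
        {
            return FileFormat::NEOX_2;
        }
        return FileFormat::GGML;
    }

    int main()
    {
        printf("%d\n", classify_by_vocab(50432)); // typical NeoX vocab -> 401
        printf("%d\n", classify_by_vocab(32000)); // llama v1 vocab     -> 1
    }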
@@ -30,6 +30,7 @@ enum FileFormat
     RWKV_1=300,
 
     NEOX_1=400,
+    NEOX_2=401,
 };
 
 enum ModelLoadResult
@@ -17,13 +17,13 @@
 
 
 // load the model's weights from a file
-bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_vocab & vocab) {
+ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_vocab & vocab, FileFormat file_format) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
+        return ModelLoadResult::FAIL;
     }
 
     // verify magic
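This signature change is the backbone of the commit: the loader's boolean result becomes a tri-state ModelLoadResult, so a recoverable size mismatch (RETRY_LOAD) is distinguishable from a hard failure (FAIL). A hedged sketch of the pattern in isolation, with load_weights as a hypothetical function rather than the project's actual loader:

    #include <cstdio>
    #include <string>

    // The tri-state result type this commit threads through the NeoX loader.
    enum class ModelLoadResult { FAIL, SUCCESS, RETRY_LOAD };

    // Hypothetical loader skeleton: where the old bool version returned false
    // for every error, the tri-state version can ask the caller to retry with
    // the legacy file layout instead of aborting outright.
    static ModelLoadResult load_weights(const std::string &fname, bool size_mismatch)
    {
        if (fname.empty())
            return ModelLoadResult::FAIL;        // unrecoverable: bad input
        if (size_mismatch)
            return ModelLoadResult::RETRY_LOAD;  // recoverable: try the old layout
        return ModelLoadResult::SUCCESS;
    }

    int main()
    {
        ModelLoadResult r = load_weights("model.bin", /*size_mismatch=*/true);
        printf("retry requested: %s\n", r == ModelLoadResult::RETRY_LOAD ? "yes" : "no");
    }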
@@ -32,7 +32,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
         fin.read((char *) &magic, sizeof(magic));
         if (magic != 0x67676d6c) {
             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
+            return ModelLoadResult::FAIL;
         }
     }
 
@@ -88,7 +88,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
         {
             fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
                     __func__, fname.c_str(), model.hparams.ftype);
-            return false;
+            return ModelLoadResult::FAIL;
         }
     }
 
@@ -151,7 +151,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
     model.ctx = ggml_init(params);
     if (!model.ctx) {
         fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-        return false;
+        return ModelLoadResult::FAIL;
     }
     }
 
@@ -276,19 +276,19 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
 
         if (model.tensors.find(name.data()) == model.tensors.end()) {
             fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-            return false;
+            return ModelLoadResult::FAIL;
         }
 
         auto tensor = model.tensors[name.data()];
         if (ggml_nelements(tensor) != nelements) {
             fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-            return false;
+            return ModelLoadResult::FAIL;
         }
 
         if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
             fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
                     __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
-            return false;
+            return ModelLoadResult::FAIL;
         }
 
         // for debugging
@@ -296,12 +296,29 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
             printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
         }
 
-        const size_t bpe = ggml_type_size(ggml_type(ttype));
+        size_t bpe = ggml_type_size(ggml_type(ttype));
+
+        if(file_format==FileFormat::NEOX_1)
+        {
+            switch (ttype) {
+                case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
+                case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
+                case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
+                case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
+                case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break;
+                case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break;
+                default:
+                {
+                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ttype);
+                    return ModelLoadResult::FAIL;
+                }
+            };
+        }
 
         if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
             fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                     __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-            return false;
+            return ModelLoadResult::RETRY_LOAD;
         }
 
         fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
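This hunk is where the backwards compatibility actually happens: NEOX_1 files number their tensor types differently, so bytes-per-element must be remapped before the size check, and a mismatch now returns RETRY_LOAD so the caller can re-attempt under the other layout. A simplified, runnable illustration of that check; bytes_per_element is a hypothetical stand-in for ggml_type_size, and the hard-coded sizes are assumptions that ignore ggml's block layout for quantized types:

    #include <cstdio>

    // Hypothetical stand-in for ggml_type_size(); byte sizes are assumed.
    static size_t bytes_per_element(int ttype, bool legacy_neox_1)
    {
        if (legacy_neox_1)
        {
            // NEOX_1 files use a different tensor-type numbering, so the id
            // must be remapped before sizing (cf. the switch in the hunk above).
            switch (ttype)
            {
                case 0: return 4;  // F32
                case 1: return 2;  // F16
                default: return 0; // unknown type id -> treated as FAIL upstream
            }
        }
        return 4; // simplified: assume F32 for NEOX_2 files
    }

    int main()
    {
        const size_t nelements = 1024;
        const size_t bpe = bytes_per_element(/*ttype=*/0, /*legacy_neox_1=*/true);
        const size_t expected = nelements * bpe;
        const size_t actual = 2048; // pretend this came from ggml_nbytes(tensor)
        if (expected != actual)
        {
            // The patched loader returns RETRY_LOAD here, so the caller can
            // re-attempt the whole load under the other NeoX layout.
            printf("size mismatch: got %zu, expected %zu -> retry\n", actual, expected);
            return 2;
        }
        printf("tensor size ok (%zu bytes)\n", actual);
        return 0;
    }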
@@ -320,7 +337,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
 
     fin.close();
 
-    return true;
+    return ModelLoadResult::SUCCESS;
 }
 
 // evaluate the transformer