diff --git a/Makefile b/Makefile
index 6fd26679e..4611f6881 100644
--- a/Makefile
+++ b/Makefile
@@ -238,8 +238,8 @@ ggml.o: ggml.c ggml.h
 ggml_blas.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_blas.o
 
-ggml_old_v1.o: otherarch/ggml_old.c otherarch/ggml_old.h
-	$(CC) $(CFLAGS) -c otherarch/ggml_old.c -o ggml_old_v1.o
+ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
+	$(CC) $(CFLAGS) -c otherarch/ggml_v1.c -o ggml_v1.o
 
 llama.o: llama.cpp llama.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
@@ -256,8 +256,8 @@ expose.o: expose.cpp expose.h
 llama_adapter.o:
 	$(CXX) $(CXXFLAGS) -c llama_adapter.cpp -o llama_adapter.o
 
-gptj_adapter.o: ggml_old_v1.o
-	$(CXX) $(CXXFLAGS) otherarch/gptj_old.cpp otherarch/utils.cpp ggml_old_v1.o gptj_adapter.cpp -o gptj_adapter.o
+gptj_adapter.o: ggml.o
+	$(CXX) $(CXXFLAGS) otherarch/gptj.cpp otherarch/utils.cpp ggml.o gptj_adapter.cpp -o gptj_adapter.o
 
 clean:
 	rm -vf *.o main quantize perplexity embedding main.exe quantize.exe llamacpp.dll llamacpp_blas.dll gpt2.exe gptj.exe
@@ -268,8 +268,11 @@ main: examples/main/main.cpp ggml.o llama.o common.o
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-gptj: ggml_old_v1.o
-	$(CXX) $(CXXFLAGS) otherarch/gptj_old.cpp otherarch/utils.cpp ggml_old_v1.o -o gptj $(LDFLAGS)
+gptj: ggml.o
+	$(CXX) $(CXXFLAGS) otherarch/gptj.cpp otherarch/utils.cpp ggml.o -o gptj $(LDFLAGS)
+
+gptjold: ggml_v1.o
+	$(CXX) $(CXXFLAGS) otherarch/gptj_old.cpp otherarch/utils.cpp ggml_v1.o -o gptjold $(LDFLAGS)
 
 llamalib: ggml.o expose.o llama_adapter.o llamaextra.o common.o
diff --git a/gptj_adapter.cpp b/gptj_adapter.cpp
index 9498855d5..1a42d4cbf 100644
--- a/gptj_adapter.cpp
+++ b/gptj_adapter.cpp
@@ -35,7 +35,7 @@ bool gptj_load_model(const load_model_inputs inputs, FileFormat in_file_format)
     n_batch = params.n_batch = inputs.batch_size;
     modelname = params.model = inputs.model_filename;
 
-    if (!legacy_gptj_model_load(params.model, model, vocab)) {
+    if (!gptj_model_load(params.model, model, vocab)) {
         fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
         return false;
     }
@@ -151,7 +151,7 @@ generation_outputs gptj_generate(const generation_inputs inputs, generation_outp
             printf("\rGenerating (%d / %d tokens)", (1 + params.n_predict - remaining_tokens), params.n_predict);
         }
 
-        if (!legacy_gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token))
+        if (!gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token))
         {
             fprintf(stderr, "Failed to predict\n");
             snprintf(output.text, sizeof(output.text), "%s", "");
diff --git a/llamacpp.dll b/llamacpp.dll
index 4a6b9702a..9936ef2cb 100644
Binary files a/llamacpp.dll and b/llamacpp.dll differ
diff --git a/llamacpp_blas.dll b/llamacpp_blas.dll
index 78f91df4a..8ac9ee21d 100644
Binary files a/llamacpp_blas.dll and b/llamacpp_blas.dll differ
diff --git a/llamaextra.cpp b/llamaextra.cpp
index 40ada56bd..2407bf53e 100644
--- a/llamaextra.cpp
+++ b/llamaextra.cpp
@@ -272,7 +272,13 @@ void print_tok_vec(std::vector &embd)
     vocab.id_to_token.resize(model.hparams.n_vocab);
 
     std::vector<char> tmp(64);
-    for (int i = 0; i < model.hparams.n_vocab; i++) {
+    int32_t vocabloops = model.hparams.n_vocab;
+    if(vocabloops==32001 && legacy_file_format)
+    {
+        printf("---\n!! WARNING: Model appears to be GPT4ALL v1 model, triggering compatibility fix !!\n---\n");
+        vocabloops -= 1;
+    }
+    for (int i = 0; i < vocabloops; i++) {
         uint32_t len;
         fin.read((char *) &len, sizeof(len));
 
diff --git a/otherarch/ggml_old.c b/otherarch/ggml_v1.c
similarity index 99%
rename from otherarch/ggml_old.c
rename to otherarch/ggml_v1.c
index 186c19a58..a5bbff566 100644
--- a/otherarch/ggml_old.c
+++ b/otherarch/ggml_v1.c
@@ -1,4 +1,4 @@
-#include "ggml_old.h"
+#include "ggml_v1.h"
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
diff --git a/otherarch/ggml_old.h b/otherarch/ggml_v1.h
similarity index 100%
rename from otherarch/ggml_old.h
rename to otherarch/ggml_v1.h
diff --git a/otherarch/gptj.cpp b/otherarch/gptj.cpp
index 19f185afd..ffb68153d 100644
--- a/otherarch/gptj.cpp
+++ b/otherarch/gptj.cpp
@@ -14,7 +14,19 @@
 #include <vector>
 #include <iostream>
 
-
+bool should_transpose_layer(std::string name)
+{
+
+    if(name.find(".mlp.fc_in.weight")!=std::string::npos ||
+       name.find(".attn.out_proj.weight")!=std::string::npos ||
+       name.find(".attn.q_proj.weight")!=std::string::npos ||
+       name.find(".attn.k_proj.weight")!=std::string::npos ||
+       name.find(".attn.v_proj.weight")!=std::string::npos)
+    {
+        return true;
+    }
+    return false;
+}
 // load the model's weights from a file
 bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {
@@ -139,6 +151,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
 
         ctx_size += (5 + 10*n_layer)*256; // object overhead
 
+        ctx_size = ctx_size * 3 / 2;
         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
@@ -279,6 +292,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
             fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
             return false;
         }
+
 
         if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
             fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
@@ -312,7 +326,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         }
 
         fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
+        //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
 
         total_size += ggml_nbytes(tensor);
         if (++n_tensors % 8 == 0) {
diff --git a/otherarch/gptj_old.cpp b/otherarch/gptj_old.cpp
index b761a0725..ef4e5f9c6 100644
--- a/otherarch/gptj_old.cpp
+++ b/otherarch/gptj_old.cpp
@@ -1,4 +1,4 @@
-#include "ggml_old.h"
+#include "ggml_v1.h"
 #include "otherarch.h"
 
 #include "utils.h"
diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h
index ac7cc425a..71ad4269a 100644
--- a/otherarch/otherarch.h
+++ b/otherarch/otherarch.h
@@ -72,3 +72,5 @@ struct gptj_model {
 
 bool legacy_gptj_model_load(const std::string &fname, gptj_model &model, gpt_vocab &vocab);
 bool legacy_gptj_eval(const gptj_model &model, const int n_threads, const int n_past, const std::vector<gpt_vocab::id> &embd_inp, std::vector<float> &embd_w, size_t &mem_per_token);
+bool gptj_model_load(const std::string &fname, gptj_model &model, gpt_vocab &vocab);
+bool gptj_eval(const gptj_model &model, const int n_threads, const int n_past, const std::vector<gpt_vocab::id> &embd_inp, std::vector<float> &embd_w, size_t &mem_per_token);
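Note on the llamaextra.cpp hunk: GPT4ALL v1 files report hparams.n_vocab as 32001, but the legacy format apparently serializes one fewer token entry than the header claims, so reading the full count would desynchronize the stream. The patch clamps the loop bound by one for that case. Below is a minimal, self-contained restatement of that guard; the helper name vocab_entries_to_read is illustrative and not part of the patch.

    // sketch.cpp -- standalone restatement of the GPT4ALL v1 vocab clamp
    #include <cstdint>
    #include <cstdio>

    // Mirrors the patched loop bound: a legacy file claiming 32001 vocab
    // entries is read as 32000 to stay aligned with the serialized data.
    static int32_t vocab_entries_to_read(int32_t n_vocab, bool legacy_file_format)
    {
        if (n_vocab == 32001 && legacy_file_format) {
            return n_vocab - 1; // GPT4ALL v1 compatibility fix
        }
        return n_vocab;
    }

    int main()
    {
        printf("%d\n", vocab_entries_to_read(32001, true));  // prints 32000
        printf("%d\n", vocab_entries_to_read(32001, false)); // prints 32001
        return 0;
    }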
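Note on should_transpose_layer in otherarch/gptj.cpp: the hunks shown here add the predicate but not its call site; it evidently tags the GPT-J MLP fc_in and attention out/q/k/v projection weights, presumably those whose storage layout differs between the legacy and current ggml formats. A standalone sketch of the same matching logic follows, with a tiny demo driver; the tensor names in the driver are illustrative only.

    // sketch.cpp -- the name-matching predicate plus a demo driver
    #include <cstdio>
    #include <string>

    // Same substring checks as the patch: flag fc_in and the attention
    // out/q/k/v projection weights by tensor name.
    static bool should_transpose_layer(const std::string &name)
    {
        return name.find(".mlp.fc_in.weight")     != std::string::npos ||
               name.find(".attn.out_proj.weight") != std::string::npos ||
               name.find(".attn.q_proj.weight")   != std::string::npos ||
               name.find(".attn.k_proj.weight")   != std::string::npos ||
               name.find(".attn.v_proj.weight")   != std::string::npos;
    }

    int main()
    {
        const char *names[] = {
            "transformer.h.0.attn.q_proj.weight", // flagged
            "transformer.h.0.mlp.fc_out.weight",  // not flagged
        };
        for (const char *n : names) {
            printf("%-40s -> %s\n", n, should_transpose_layer(n) ? "transpose" : "keep");
        }
        return 0;
    }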