From 90fe9096b4ec1eeeed09bb84ac552d76d3d1549f Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Wed, 17 May 2023 11:23:29 +0800 Subject: [PATCH] clean and refactoring pass before supporting newer models for different arch --- gpttype_adapter.cpp | 11 ++- otherarch/gpt2_v2.cpp | 1 - otherarch/gptj_v2.cpp | 1 - otherarch/neox.cpp | 13 ++- otherarch/otherarch.h | 10 +-- otherarch/tools/common-ggml.cpp | 36 +++----- otherarch/tools/common-ggml.h | 4 +- otherarch/tools/gpt2_quantize.cpp | 24 +++-- otherarch/tools/gptj_quantize.cpp | 24 +++-- otherarch/tools/gptj_v1_main.cpp | 145 ------------------------------ otherarch/tools/gptj_v2_main.cpp | 145 ------------------------------ otherarch/tools/neox_quantize.cpp | 34 ++++--- 12 files changed, 81 insertions(+), 367 deletions(-) delete mode 100644 otherarch/tools/gptj_v1_main.cpp delete mode 100644 otherarch/tools/gptj_v2_main.cpp diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index f78f3aa99..dae43a685 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -15,7 +15,6 @@ #include "llamaextra.cpp" //concat source files into one file for compilation purposes -#include "common-ggml.cpp" #include "utils.cpp" #include "gptj_v1.cpp" #include "gptj_v2.cpp" @@ -33,7 +32,7 @@ static gptj_model_v1 gptj_ctx_v1; static gptj_model gptj_ctx_v2; static gpt2_v1_model gpt2_ctx_v1; static gpt2_model gpt2_ctx_v2; -static stablelm_model neox_ctx; +static gpt_neox_model neox_ctx; static rwkv_context * rwkv_ctx_v1; static llama_context_params llama_ctx_params; static llama_context * llama_ctx_v1; @@ -378,7 +377,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5) { - ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format); + ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx, vocab, file_format); if(res==ModelLoadResult::FAIL) { fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); @@ -394,13 +393,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5); // determine the required inference memory per token: - stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); + gpt_neox_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); if(logits.size()>0 && (file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_4) && !IsNanCheck(logits[0])) { //run the black magic eval to determine if it's redpajama. VERY UGLY HACK! 
std::vector test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7"); - stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, (file_format==FileFormat::NEOX_2?FileFormat::NEOX_3:FileFormat::NEOX_5)); + gpt_neox_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, (file_format==FileFormat::NEOX_2?FileFormat::NEOX_3:FileFormat::NEOX_5)); int topid = std::max_element(logits.begin(),logits.end())-logits.begin(); std::string predicted = vocab.id_to_token[topid].c_str(); if(predicted.find("8") != std::string::npos) @@ -695,7 +694,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5) { - evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format); + evalres = gpt_neox_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format); } else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2) { diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp index 4da8b1921..96f4bf215 100644 --- a/otherarch/gpt2_v2.cpp +++ b/otherarch/gpt2_v2.cpp @@ -2,7 +2,6 @@ #include "otherarch.h" #include "utils.h" -#include "common-ggml.h" #include #include diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp index 0fc1e16c0..ccfd85897 100644 --- a/otherarch/gptj_v2.cpp +++ b/otherarch/gptj_v2.cpp @@ -2,7 +2,6 @@ #include "otherarch.h" #include "utils.h" -#include "common-ggml.h" #include #include diff --git a/otherarch/neox.cpp b/otherarch/neox.cpp index c37a24e43..5b21a6e5e 100644 --- a/otherarch/neox.cpp +++ b/otherarch/neox.cpp @@ -2,7 +2,6 @@ #include "otherarch.h" #include "utils.h" -#include "common-ggml.h" #include #include @@ -17,7 +16,7 @@ // load the model's weights from a file -ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_vocab & vocab, FileFormat file_format) { +ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) { printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); @@ -340,8 +339,8 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model & // - embd_inp: the embeddings of the tokens in the context // - embd_w: the predicted logits for the next token // -bool stablelm_eval( - const stablelm_model & model, +bool gpt_neox_eval( + const gpt_neox_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, @@ -497,7 +496,7 @@ bool stablelm_eval( } } - if(file_format==FileFormat::NEOX_3) + if(file_format==FileFormat::NEOX_3||file_format==FileFormat::NEOX_5) { // layer input + Attn cur = ggml_add(ctx0, cur, inpL); @@ -511,7 +510,7 @@ bool stablelm_eval( // post attention layer norm // note here we pass inpL instead of cur { - cur = ggml_norm(ctx0, (file_format==FileFormat::NEOX_3?cur:inpL)); + cur = ggml_norm(ctx0, ((file_format==FileFormat::NEOX_3||file_format==FileFormat::NEOX_5)?cur:inpL)); cur = ggml_add(ctx0, ggml_mul(ctx0, @@ -542,7 +541,7 @@ bool stablelm_eval( cur); } - if (file_format == FileFormat::NEOX_3) + if (file_format==FileFormat::NEOX_3||file_format==FileFormat::NEOX_5) { // layer input + FF inpL = ggml_add(ctx0, cur, inpFF); diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h index 
f194895e3..ed5927643 100644 --- a/otherarch/otherarch.h +++ b/otherarch/otherarch.h @@ -214,7 +214,7 @@ struct gpt2_model { }; // default hparams (StableLM 3B) -struct stablelm_hparams { +struct gpt_neox_hparams { int32_t n_vocab = 50257; int32_t n_ctx = 4096; int32_t n_embd = 4096; @@ -224,7 +224,7 @@ struct stablelm_hparams { int32_t ftype = 1; }; -struct stablelm_layer { +struct gpt_neox_layer { // pre normalization struct ggml_tensor * ln_1_g; struct ggml_tensor * ln_1_b; @@ -248,8 +248,8 @@ struct stablelm_layer { struct ggml_tensor * c_mlp_proj_b; }; -struct stablelm_model { - stablelm_hparams hparams; +struct gpt_neox_model { + gpt_neox_hparams hparams; // normalization struct ggml_tensor * ln_f_g; @@ -260,7 +260,7 @@ struct stablelm_model { struct ggml_tensor * lmh_g; // language model head //struct ggml_tensor * lmh_b; // language model bias - std::vector layers; + std::vector layers; // key + value memory struct ggml_tensor * memory_k; diff --git a/otherarch/tools/common-ggml.cpp b/otherarch/tools/common-ggml.cpp index 3c0bfe286..9843303fc 100644 --- a/otherarch/tools/common-ggml.cpp +++ b/otherarch/tools/common-ggml.cpp @@ -1,12 +1,11 @@ #include "common-ggml.h" #include +#include static const std::map GGML_FTYPE_MAP = { {"q4_0", GGML_FTYPE_MOSTLY_Q4_0}, {"q4_1", GGML_FTYPE_MOSTLY_Q4_1}, - {"q4_2", GGML_FTYPE_MOSTLY_Q4_2}, - {"q4_3", GGML_FTYPE_MOSTLY_Q4_3}, {"q5_0", GGML_FTYPE_MOSTLY_Q5_0}, {"q5_1", GGML_FTYPE_MOSTLY_Q5_1}, {"q8_0", GGML_FTYPE_MOSTLY_Q8_0}, @@ -46,8 +45,6 @@ bool ggml_common_quantize_0( switch (ftype) { case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; - case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break; - case GGML_FTYPE_MOSTLY_Q4_3: qtype = GGML_TYPE_Q4_3; break; case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; @@ -91,7 +88,7 @@ bool ggml_common_quantize_0( } int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; + int32_t ne[4] = { 1, 1, 1, 1 }; for (int i = 0; i < n_dims; ++i) { finp.read (reinterpret_cast(&ne[i]), sizeof(ne[i])); nelements *= ne[i]; @@ -100,7 +97,7 @@ bool ggml_common_quantize_0( std::string name(length, 0); finp.read (&name[0], length); - printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype)); + printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); bool quantize = false; @@ -166,31 +163,23 @@ bool ggml_common_quantize_0( switch ((ggml_type) ttype) { case GGML_TYPE_Q4_0: { - cur_size = ggml_quantize_q4_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_Q4_1: { - cur_size = ggml_quantize_q4_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_2: - { - cur_size = ggml_quantize_q4_2_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_3: - { - cur_size = ggml_quantize_q4_3_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_Q5_0: { - cur_size = ggml_quantize_q5_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = 
ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_Q5_1: { - cur_size = ggml_quantize_q5_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_Q8_0: { - cur_size = ggml_quantize_q8_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_F32: case GGML_TYPE_F16: @@ -198,7 +187,6 @@ bool ggml_common_quantize_0( case GGML_TYPE_I16: case GGML_TYPE_I32: case GGML_TYPE_Q8_1: - case GGML_TYPE_Q8_1B: case GGML_TYPE_COUNT: { fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); @@ -210,11 +198,11 @@ bool ggml_common_quantize_0( total_size_new += cur_size; printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < hist_cur.size(); ++i) { + for (int i = 0; i < (int) hist_cur.size(); ++i) { hist_all[i] += hist_cur[i]; } - for (int i = 0; i < hist_cur.size(); ++i) { + for (int i = 0; i < (int) hist_cur.size(); ++i) { printf("%5.3f ", hist_cur[i] / (float)nelements); } printf("\n"); @@ -232,12 +220,12 @@ bool ggml_common_quantize_0( { int64_t sum_all = 0; - for (int i = 0; i < hist_all.size(); ++i) { + for (int i = 0; i < (int) hist_all.size(); ++i) { sum_all += hist_all[i]; } printf("%s: hist: ", __func__); - for (int i = 0; i < hist_all.size(); ++i) { + for (int i = 0; i < (int) hist_all.size(); ++i) { printf("%5.3f ", hist_all[i] / (float)sum_all); } printf("\n"); diff --git a/otherarch/tools/common-ggml.h b/otherarch/tools/common-ggml.h index 9b2e35ad1..29ba4ad5f 100644 --- a/otherarch/tools/common-ggml.h +++ b/otherarch/tools/common-ggml.h @@ -2,16 +2,14 @@ #include "ggml.h" -#include #include #include #include +enum ggml_ftype ggml_parse_ftype(const char * str); void ggml_print_ftypes(FILE * fp = stderr); -enum ggml_ftype ggml_parse_ftype(const char * str); - bool ggml_common_quantize_0( std::ifstream & finp, std::ofstream & fout, diff --git a/otherarch/tools/gpt2_quantize.cpp b/otherarch/tools/gpt2_quantize.cpp index 7e9ad7739..a49de5899 100644 --- a/otherarch/tools/gpt2_quantize.cpp +++ b/otherarch/tools/gpt2_quantize.cpp @@ -18,7 +18,7 @@ struct gpt2_hparams { int32_t n_embd = 768; int32_t n_head = 12; int32_t n_layer = 12; - int32_t ftype = 1; + int32_t ftype = 1; }; // quantize a model @@ -60,21 +60,27 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: f16 = %d\n", __func__, hparams.ftype); + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + 
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fout.write((char *) &ftype, sizeof(hparams.ftype)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); } // load vocab diff --git a/otherarch/tools/gptj_quantize.cpp b/otherarch/tools/gptj_quantize.cpp index a9f4232be..5e1c695aa 100644 --- a/otherarch/tools/gptj_quantize.cpp +++ b/otherarch/tools/gptj_quantize.cpp @@ -21,7 +21,7 @@ struct gptj_hparams { int32_t n_head = 16; int32_t n_layer = 28; int32_t n_rot = 64; - int32_t f16 = 1; + int32_t ftype = 1; }; // quantize a model @@ -64,14 +64,20 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - finp.read((char *) &hparams.f16, sizeof(hparams.f16)); + finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: f16 = %d\n", __func__, hparams.f16); + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); @@ -79,7 +85,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fout.write((char *) &ftype, sizeof(hparams.f16)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); } // load vocab diff --git a/otherarch/tools/gptj_v1_main.cpp b/otherarch/tools/gptj_v1_main.cpp deleted file mode 100644 index 6cc152cb7..000000000 --- a/otherarch/tools/gptj_v1_main.cpp +++ /dev/null @@ -1,145 +0,0 @@ -#include "gptj_v1.cpp" - -int main(int argc, char ** argv) { - ggml_v1_time_init(); - const int64_t t_main_start_us = ggml_v1_time_us(); 
- - gpt_params params; - params.model = "models/gpt-j-6B/ggml-model.bin"; - - if (utils_gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - if( !isatty(STDIN_FILENO) ){ - std::string line; - while( std::getline(std::cin, line) ){ - params.prompt = params.prompt + "\n" + line; - } - } else { - params.prompt = utils_gpt_random_prompt(rng); - } - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gptj_model_v1 model; - FileFormat file_format = FileFormat::GPTJ_2; - - // load the model - { - const int64_t t_start_us = ggml_v1_time_us(); - - if (legacy_gptj_model_load(params.model, model, vocab, file_format)!=ModelLoadResult::SUCCESS) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_v1_time_us() - t_start_us; - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - legacy_gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format); - - for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_v1_time_us(); - - if (!legacy_gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token,file_format)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_v1_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_v1_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_v1_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (int k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_v1_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_v1_free(model.ctx); - - return 0; -} diff --git 
a/otherarch/tools/gptj_v2_main.cpp b/otherarch/tools/gptj_v2_main.cpp deleted file mode 100644 index 87346476b..000000000 --- a/otherarch/tools/gptj_v2_main.cpp +++ /dev/null @@ -1,145 +0,0 @@ -#include "gptj_v2.cpp" - - -int main(int argc, char ** argv) { - ggml_time_init(); - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-j-6B/ggml-model.bin"; - - if (utils_gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - if( !isatty(STDIN_FILENO) ){ - std::string line; - while( std::getline(std::cin, line) ){ - params.prompt = params.prompt + "\n" + line; - } - } else { - params.prompt = utils_gpt_random_prompt(rng); - } - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gptj_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (gptj_model_load(params.model, model, vocab)==ModelLoadResult::FAIL) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); - printf("\n"); - - std::vector embd; - - // determine the required inference memory per token: - size_t mem_per_token = 0; - gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token); - - for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (int k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (embd.size() > params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, 
t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - return 0; -} \ No newline at end of file diff --git a/otherarch/tools/neox_quantize.cpp b/otherarch/tools/neox_quantize.cpp index b719a5dc5..602c9d3c4 100644 --- a/otherarch/tools/neox_quantize.cpp +++ b/otherarch/tools/neox_quantize.cpp @@ -14,18 +14,19 @@ #include // default hparams (StableLM 3B) -struct stablelm_hparams { +struct gpt_neox_hparams { int32_t n_vocab = 50257; int32_t n_ctx = 4096; int32_t n_embd = 4096; int32_t n_head = 32; int32_t n_layer = 16; int32_t n_rot = 32; // 0.25 * (n_embd / n_head) + int32_t par_res = 1; // 1 = true, 0 = false int32_t ftype = 1; }; // quantize a model -bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { +bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { gpt_vocab vocab; printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); @@ -54,7 +55,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fout.write((char *) &magic, sizeof(magic)); } - stablelm_hparams hparams; + gpt_neox_hparams hparams; // load hparams { @@ -64,14 +65,22 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string & finp.read((char *) &hparams.n_head, sizeof(hparams.n_head)); finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); + finp.read((char *) &hparams.par_res, sizeof(hparams.par_res)); finp.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: par_res = %d\n", __func__, hparams.par_res); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); @@ -79,7 +88,8 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fout.write((char *) &hparams.n_head, sizeof(hparams.n_head)); fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot)); - fout.write((char *) &ftype, sizeof(hparams.ftype)); + fout.write((char *) &hparams.par_res, sizeof(hparams.par_res)); + fout.write((char *) &ftype_dst, sizeof(ftype_dst)); } // load vocab @@ -118,7 +128,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string & } // usage: -// ./stablelm2-quantize models/stablelm2-117M/ggml-model.bin 
models/stablelm2-117M/ggml-model-quant.bin type
+//  ./gpt-neox-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
     ggml_time_init();
@@ -148,7 +158,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();

-        if (!stablelm_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
+        if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }
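
Note on the quantizer changes above: gpt2_quantize, gptj_quantize and neox_quantize now pack the quantization format version into the stored ftype field as GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype, and recover the source version with an integer division. Below is a minimal standalone sketch of that packing; QNT_VERSION and QNT_VERSION_FACTOR are stand-in values for illustration, not the real GGML_QNT_VERSION / GGML_QNT_VERSION_FACTOR macros from ggml.h.

// Standalone sketch of the ftype / quantization-version packing used by the
// quantizers in this patch. The two constants are stand-ins, not the real
// GGML_QNT_VERSION / GGML_QNT_VERSION_FACTOR macros.
#include <cstdint>
#include <cstdio>

static const int32_t QNT_VERSION        = 2;    // assumed current format version
static const int32_t QNT_VERSION_FACTOR = 1000; // assumed packing factor

int main() {
    const int32_t ftype = 2; // e.g. a q4_0 file type

    // what the quantize tool writes into the hparams ftype field
    const int32_t ftype_dst = QNT_VERSION * QNT_VERSION_FACTOR + ftype;

    // what a loader recovers from a stored value
    const int32_t qntvr     = ftype_dst / QNT_VERSION_FACTOR;
    const int32_t ftype_raw = ftype_dst % QNT_VERSION_FACTOR;

    printf("stored = %d -> qntvr = %d, ftype = %d\n", ftype_dst, qntvr, ftype_raw);
    return 0;
}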
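
Note on the gpt_neox_eval changes above: the NEOX_3/NEOX_5 branches switch a layer from the default parallel residual to a sequential residual (the RedPajama-style layout that the "1 2 3 4 5 6 7" probe in gpttype_adapter.cpp tries to detect). A rough sketch of the difference follows; norm/attn/mlp are trivial stand-ins, not the real ggml graph ops.

// Rough sketch of the two residual layouts handled in gpt_neox_eval.
#include <cstdio>

static float norm(float x) { return x; }         // stand-in for layernorm
static float attn(float x) { return 0.5f * x; }  // stand-in for self-attention
static float mlp (float x) { return x + 1.0f; }  // stand-in for the feed-forward block

// parallel residual (default branch):
// out = x + attn(norm(x)) + mlp(norm(x))  -- the MLP sees the layer input
static float layer_parallel(float x) {
    return x + attn(norm(x)) + mlp(norm(x));
}

// sequential residual (NEOX_3 / NEOX_5 branch):
// h = x + attn(norm(x)); out = h + mlp(norm(h))  -- the MLP sees input + attention
static float layer_sequential(float x) {
    const float h = x + attn(norm(x));
    return h + mlp(norm(h));
}

int main() {
    const float x = 1.0f;
    printf("parallel: %.2f, sequential: %.2f\n", layer_parallel(x), layer_sequential(x));
    return 0;
}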