clean and refactoring pass before supporting newer models for different arch

Concedo 2023-05-17 11:23:29 +08:00
parent 60ee00428b
commit 90fe9096b4
12 changed files with 81 additions and 367 deletions


@@ -15,7 +15,6 @@
#include "llamaextra.cpp"
//concat source files into one file for compilation purposes
#include "common-ggml.cpp"
#include "utils.cpp"
#include "gptj_v1.cpp"
#include "gptj_v2.cpp"
@@ -33,7 +32,7 @@ static gptj_model_v1 gptj_ctx_v1;
static gptj_model gptj_ctx_v2;
static gpt2_v1_model gpt2_ctx_v1;
static gpt2_model gpt2_ctx_v2;
static stablelm_model neox_ctx;
static gpt_neox_model neox_ctx;
static rwkv_context * rwkv_ctx_v1;
static llama_context_params llama_ctx_params;
static llama_context * llama_ctx_v1;
@@ -378,7 +377,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
}
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
{
ModelLoadResult res = stablelm_model_load(params.model, neox_ctx, vocab, file_format);
ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx, vocab, file_format);
if(res==ModelLoadResult::FAIL)
{
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -394,13 +393,13 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
// determine the required inference memory per token:
stablelm_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
gpt_neox_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
if(logits.size()>0 && (file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_4) && !IsNanCheck(logits[0]))
{
//run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
stablelm_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, (file_format==FileFormat::NEOX_2?FileFormat::NEOX_3:FileFormat::NEOX_5));
gpt_neox_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token, (file_format==FileFormat::NEOX_2?FileFormat::NEOX_3:FileFormat::NEOX_5));
int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
std::string predicted = vocab.id_to_token[topid].c_str();
if(predicted.find("8") != std::string::npos)
@@ -695,7 +694,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
}
else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
{
evalres = stablelm_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
evalres = gpt_neox_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
}
else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
{

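Note on the RedPajama detection hack in the hunks above: NEOX_2/NEOX_4 files and their RedPajama-style NEOX_3/NEOX_5 counterparts cannot be told apart from the header alone, so the loader tokenizes "1 2 3 4 5 6 7", re-runs gpt_neox_eval under the alternate format, and checks whether the top-scoring next token contains "8"; the branch that acts on the result falls outside the shown hunk. A minimal, self-contained sketch of that check, with stand-in logits and a stand-in vocab slice in place of the real tokenizer and eval:

#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    // pretend these are the logits produced by evaluating "1 2 3 4 5 6 7"
    // under the alternate (RedPajama-style) graph -- illustrative values only
    std::vector<float> logits = {0.10f, 0.85f, 0.05f};
    std::vector<std::string> id_to_token = {" 7", " 8", " 9"};  // illustrative vocab slice

    const int topid = std::max_element(logits.begin(), logits.end()) - logits.begin();
    const std::string predicted = id_to_token[topid];
    const bool redpajama_style = predicted.find("8") != std::string::npos;

    printf("top token = '%s' -> redpajama-style graph: %s\n",
           predicted.c_str(), redpajama_style ? "yes" : "no");
    return 0;
}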

@@ -2,7 +2,6 @@
#include "otherarch.h"
#include "utils.h"
#include "common-ggml.h"
#include <cassert>
#include <cmath>


@@ -2,7 +2,6 @@
#include "otherarch.h"
#include "utils.h"
#include "common-ggml.h"
#include <cassert>
#include <cmath>


@@ -2,7 +2,6 @@
#include "otherarch.h"
#include "utils.h"
#include "common-ggml.h"
#include <cassert>
#include <cmath>
@@ -17,7 +16,7 @@
// load the model's weights from a file
ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_vocab & vocab, FileFormat file_format) {
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
@@ -340,8 +339,8 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
bool stablelm_eval(
const stablelm_model & model,
bool gpt_neox_eval(
const gpt_neox_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
@@ -497,7 +496,7 @@ bool stablelm_eval(
}
}
if(file_format==FileFormat::NEOX_3)
if(file_format==FileFormat::NEOX_3||file_format==FileFormat::NEOX_5)
{
// layer input + Attn
cur = ggml_add(ctx0, cur, inpL);
@@ -511,7 +510,7 @@ bool stablelm_eval(
// post attention layer norm
// note here we pass inpL instead of cur
{
cur = ggml_norm(ctx0, (file_format==FileFormat::NEOX_3?cur:inpL));
cur = ggml_norm(ctx0, ((file_format==FileFormat::NEOX_3||file_format==FileFormat::NEOX_5)?cur:inpL));
cur = ggml_add(ctx0,
ggml_mul(ctx0,
@@ -542,7 +541,7 @@ bool stablelm_eval(
cur);
}
if (file_format == FileFormat::NEOX_3)
if (file_format==FileFormat::NEOX_3||file_format==FileFormat::NEOX_5)
{
// layer input + FF
inpL = ggml_add(ctx0, cur, inpFF);
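The NEOX_3/NEOX_5 branches above switch the layer between the two residual layouts found in GPT-NeoX-family checkpoints: the classic parallel residual, where attention and feed-forward both read the layer input, and the sequential layout used by RedPajama-style models, where the feed-forward reads the attention output. A scalar toy sketch of the difference; norm/attn/mlp are illustrative stand-ins, not the real ggml graph built in this file:

#include <cstdio>

static float norm(float x) { return x; }          // stand-in for ggml_norm + scale/bias
static float attn(float x) { return 2.0f * x; }   // stand-in for the attention block
static float mlp (float x) { return x + 1.0f; }   // stand-in for the feed-forward block

int main() {
    const float x = 1.0f;

    // parallel residual (classic GPT-NeoX / StableLM path):
    //   out = x + attn(norm(x)) + mlp(norm(x))
    const float parallel = x + attn(norm(x)) + mlp(norm(x));

    // sequential residual (NEOX_3 / NEOX_5, RedPajama-style path):
    //   h = x + attn(norm(x));  out = h + mlp(norm(h))
    float sequential = x + attn(norm(x));
    sequential += mlp(norm(sequential));

    printf("parallel = %.1f, sequential = %.1f\n", parallel, sequential);
    return 0;
}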


@@ -214,7 +214,7 @@ struct gpt2_model {
};
// default hparams (StableLM 3B)
struct stablelm_hparams {
struct gpt_neox_hparams {
int32_t n_vocab = 50257;
int32_t n_ctx = 4096;
int32_t n_embd = 4096;
@@ -224,7 +224,7 @@ struct stablelm_hparams {
int32_t ftype = 1;
};
struct stablelm_layer {
struct gpt_neox_layer {
// pre normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;
@@ -248,8 +248,8 @@ struct stablelm_layer {
struct ggml_tensor * c_mlp_proj_b;
};
struct stablelm_model {
stablelm_hparams hparams;
struct gpt_neox_model {
gpt_neox_hparams hparams;
// normalization
struct ggml_tensor * ln_f_g;
@@ -260,7 +260,7 @@ struct stablelm_model {
struct ggml_tensor * lmh_g; // language model head
//struct ggml_tensor * lmh_b; // language model bias
std::vector<stablelm_layer> layers;
std::vector<gpt_neox_layer> layers;
// key + value memory
struct ggml_tensor * memory_k;


@@ -1,12 +1,11 @@
#include "common-ggml.h"
#include <regex>
#include <map>
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
{"q4_3", GGML_FTYPE_MOSTLY_Q4_3},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
@@ -46,8 +45,6 @@ bool ggml_common_quantize_0(
switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
case GGML_FTYPE_MOSTLY_Q4_3: qtype = GGML_TYPE_Q4_3; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
@@ -91,7 +88,7 @@ bool ggml_common_quantize_0(
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
int32_t ne[4] = { 1, 1, 1, 1 };
for (int i = 0; i < n_dims; ++i) {
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
@@ -100,7 +97,7 @@ bool ggml_common_quantize_0(
std::string name(length, 0);
finp.read (&name[0], length);
printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype));
printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
bool quantize = false;
@@ -166,31 +163,23 @@ bool ggml_common_quantize_0(
switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_2:
{
cur_size = ggml_quantize_q4_2_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_3:
{
cur_size = ggml_quantize_q4_3_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_0:
{
cur_size = ggml_quantize_q5_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_1:
{
cur_size = ggml_quantize_q5_1_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q8_0:
{
cur_size = ggml_quantize_q8_0_v2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
@@ -198,7 +187,6 @@ bool ggml_common_quantize_0(
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_1B:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
@@ -210,11 +198,11 @@ bool ggml_common_quantize_0(
total_size_new += cur_size;
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
for (int i = 0; i < hist_cur.size(); ++i) {
for (int i = 0; i < (int) hist_cur.size(); ++i) {
hist_all[i] += hist_cur[i];
}
for (int i = 0; i < hist_cur.size(); ++i) {
for (int i = 0; i < (int) hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
}
printf("\n");
@@ -232,12 +220,12 @@ bool ggml_common_quantize_0(
{
int64_t sum_all = 0;
for (int i = 0; i < hist_all.size(); ++i) {
for (int i = 0; i < (int) hist_all.size(); ++i) {
sum_all += hist_all[i];
}
printf("%s: hist: ", __func__);
for (int i = 0; i < hist_all.size(); ++i) {
for (int i = 0; i < (int) hist_all.size(); ++i) {
printf("%5.3f ", hist_all[i] / (float)sum_all);
}
printf("\n");


@@ -2,16 +2,14 @@
#include "ggml.h"
#include <map>
#include <fstream>
#include <vector>
#include <string>
enum ggml_ftype ggml_parse_ftype(const char * str);
void ggml_print_ftypes(FILE * fp = stderr);
enum ggml_ftype ggml_parse_ftype(const char * str);
bool ggml_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,

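For context, the two declarations kept above are the front door of the shared quantizer: ggml_print_ftypes lists the supported format names and ggml_parse_ftype maps a name such as "q4_0" to a ggml_ftype value. A minimal usage sketch, assuming the helpers behave as in upstream ggml's common-ggml; the argument handling here is illustrative only:

#include <cstdio>
#include "common-ggml.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <ftype>\n", argv[0]);
        ggml_print_ftypes();                 // prints the supported names (defaults to stderr)
        return 1;
    }
    const ggml_ftype ftype = ggml_parse_ftype(argv[1]);
    printf("parsed ftype = %d\n", (int) ftype);
    return 0;
}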

@@ -18,7 +18,7 @@ struct gpt2_hparams {
int32_t n_embd = 768;
int32_t n_head = 12;
int32_t n_layer = 12;
int32_t ftype = 1;
int32_t ftype = 1;
};
// quantize a model
@@ -60,21 +60,27 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.ftype);
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &ftype, sizeof(hparams.ftype));
fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
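The rewritten header handling above (and the matching changes in the gptj and gpt-neox quantizers further down) follows upstream ggml's convention of packing the quantization format version into the ftype field: dividing the stored ftype by GGML_QNT_VERSION_FACTOR recovers the version the source file was written with, and the output header stores GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype. A worked arithmetic sketch, assuming the usual GGML_QNT_VERSION_FACTOR of 1000; the concrete version and ftype numbers are illustrative only:

#include <cstdint>
#include <cstdio>

int main() {
    const int32_t QNT_VERSION_FACTOR = 1000;  // stand-in for GGML_QNT_VERSION_FACTOR (assumed value)
    const int32_t QNT_VERSION        = 2;     // stand-in for GGML_QNT_VERSION (illustrative)

    const int32_t ftype_src = 2002;                            // packed ftype read from a source header
    const int32_t qntvr_src = ftype_src / QNT_VERSION_FACTOR;  // -> 2, version the file was written with
    const int32_t ftype_raw = ftype_src % QNT_VERSION_FACTOR;  // -> 2, bare ftype (shown for illustration)

    const int32_t ftype_new = 3;                               // target ftype picked by the user
    const int32_t ftype_dst = QNT_VERSION * QNT_VERSION_FACTOR + ftype_new;  // -> 2003, written out

    printf("src: qntvr=%d ftype=%d -> dst: packed ftype=%d\n", qntvr_src, ftype_raw, ftype_dst);
    return 0;
}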


@@ -21,7 +21,7 @@ struct gptj_hparams {
int32_t n_head = 16;
int32_t n_layer = 28;
int32_t n_rot = 64;
int32_t f16 = 1;
int32_t ftype = 1;
};
// quantize a model
@@ -64,14 +64,20 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16);
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@@ -79,7 +85,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fout.write((char *) &ftype, sizeof(hparams.f16));
fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab


@@ -1,145 +0,0 @@
#include "gptj_v1.cpp"
int main(int argc, char ** argv) {
ggml_v1_time_init();
const int64_t t_main_start_us = ggml_v1_time_us();
gpt_params params;
params.model = "models/gpt-j-6B/ggml-model.bin";
if (utils_gpt_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.seed < 0) {
params.seed = time(NULL);
}
printf("%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.prompt.empty()) {
if( !isatty(STDIN_FILENO) ){
std::string line;
while( std::getline(std::cin, line) ){
params.prompt = params.prompt + "\n" + line;
}
} else {
params.prompt = utils_gpt_random_prompt(rng);
}
}
int64_t t_load_us = 0;
gpt_vocab vocab;
gptj_model_v1 model;
FileFormat file_format = FileFormat::GPTJ_2;
// load the model
{
const int64_t t_start_us = ggml_v1_time_us();
if (legacy_gptj_model_load(params.model, model, vocab, file_format)!=ModelLoadResult::SUCCESS) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1;
}
t_load_us = ggml_v1_time_us() - t_start_us;
}
int n_past = 0;
int64_t t_sample_us = 0;
int64_t t_predict_us = 0;
std::vector<float> logits;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
printf("\n");
std::vector<gpt_vocab::id> embd;
// determine the required inference memory per token:
size_t mem_per_token = 0;
legacy_gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_v1_time_us();
if (!legacy_gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token,file_format)) {
printf("Failed to predict\n");
return 1;
}
t_predict_us += ggml_v1_time_us() - t_start_us;
}
n_past += embd.size();
embd.clear();
if (i >= embd_inp.size()) {
// sample next token
const int top_k = params.top_k;
const float top_p = params.top_p;
const float temp = params.temp;
const int n_vocab = model.hparams.n_vocab;
gpt_vocab::id id = 0;
{
const int64_t t_start_sample_us = ggml_v1_time_us();
id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
t_sample_us += ggml_v1_time_us() - t_start_sample_us;
}
// add it to the context
embd.push_back(id);
} else {
// if here, it means we are still processing the input prompt
for (int k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
if (embd.size() > params.n_batch) {
break;
}
}
i += embd.size() - 1;
}
// display text
for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str());
}
fflush(stdout);
// end of text token
if (embd.back() == 50256) {
break;
}
}
// report timing
{
const int64_t t_main_end_us = ggml_v1_time_us();
printf("\n\n");
printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}
ggml_v1_free(model.ctx);
return 0;
}


@@ -1,145 +0,0 @@
#include "gptj_v2.cpp"
int main(int argc, char ** argv) {
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
gpt_params params;
params.model = "models/gpt-j-6B/ggml-model.bin";
if (utils_gpt_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.seed < 0) {
params.seed = time(NULL);
}
printf("%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.prompt.empty()) {
if( !isatty(STDIN_FILENO) ){
std::string line;
while( std::getline(std::cin, line) ){
params.prompt = params.prompt + "\n" + line;
}
} else {
params.prompt = utils_gpt_random_prompt(rng);
}
}
int64_t t_load_us = 0;
gpt_vocab vocab;
gptj_model model;
// load the model
{
const int64_t t_start_us = ggml_time_us();
if (gptj_model_load(params.model, model, vocab)==ModelLoadResult::FAIL) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1;
}
t_load_us = ggml_time_us() - t_start_us;
}
int n_past = 0;
int64_t t_sample_us = 0;
int64_t t_predict_us = 0;
std::vector<float> logits;
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
printf("\n");
std::vector<gpt_vocab::id> embd;
// determine the required inference memory per token:
size_t mem_per_token = 0;
gptj_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
// predict
if (embd.size() > 0) {
const int64_t t_start_us = ggml_time_us();
if (!gptj_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
printf("Failed to predict\n");
return 1;
}
t_predict_us += ggml_time_us() - t_start_us;
}
n_past += embd.size();
embd.clear();
if (i >= embd_inp.size()) {
// sample next token
const int top_k = params.top_k;
const float top_p = params.top_p;
const float temp = params.temp;
const int n_vocab = model.hparams.n_vocab;
gpt_vocab::id id = 0;
{
const int64_t t_start_sample_us = ggml_time_us();
id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
t_sample_us += ggml_time_us() - t_start_sample_us;
}
// add it to the context
embd.push_back(id);
} else {
// if here, it means we are still processing the input prompt
for (int k = i; k < embd_inp.size(); k++) {
embd.push_back(embd_inp[k]);
if (embd.size() > params.n_batch) {
break;
}
}
i += embd.size() - 1;
}
// display text
for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str());
}
fflush(stdout);
// end of text token
if (embd.back() == 50256) {
break;
}
}
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
printf("\n\n");
printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}
ggml_free(model.ctx);
return 0;
}


@@ -14,18 +14,19 @@
#include <regex>
// default hparams (StableLM 3B)
struct stablelm_hparams {
struct gpt_neox_hparams {
int32_t n_vocab = 50257;
int32_t n_ctx = 4096;
int32_t n_embd = 4096;
int32_t n_head = 32;
int32_t n_layer = 16;
int32_t n_rot = 32; // 0.25 * (n_embd / n_head)
int32_t par_res = 1; // 1 = true, 0 = false
int32_t ftype = 1;
};
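The n_rot comment above is consistent with these defaults: n_embd / n_head = 4096 / 32 = 128 dimensions per head, and 0.25 * 128 = 32 rotary dimensions.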
// quantize a model
bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
@@ -54,7 +55,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
fout.write((char *) &magic, sizeof(magic));
}
stablelm_hparams hparams;
gpt_neox_hparams hparams;
// load hparams
{
@@ -64,14 +65,22 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: par_res = %d\n", __func__, hparams.par_res);
printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
@@ -79,7 +88,8 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fout.write((char *) &ftype, sizeof(hparams.ftype));
fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
@@ -118,7 +128,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
}
// usage:
// ./stablelm2-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
// ./gpt-neox-quantize models/stablelm2-117M/ggml-model.bin models/stablelm2-117M/ggml-model-quant.bin type
//
int main(int argc, char ** argv) {
ggml_time_init();
@@ -148,7 +158,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();
if (!stablelm_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
if (!gpt_neox_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}