updated tokenizer, added support for scratch buffers for neox and gpt2

This commit is contained in:
Concedo 2023-06-19 21:29:06 +08:00
parent cb6daa3171
commit 8e2dc19dc6
5 changed files with 115 additions and 75 deletions

View file

@ -387,9 +387,15 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
{ {
printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str()); printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
const char * lora_base_arg = NULL;
if (lora_base != "") {
printf("Using LORA base model: %s\n", lora_base.c_str());
lora_base_arg = lora_base.c_str();
}
int err = llama_apply_lora_from_file(llama_ctx_v3, int err = llama_apply_lora_from_file(llama_ctx_v3,
lora_filename.c_str(), lora_filename.c_str(),
NULL, lora_base_arg,
n_threads); n_threads);
if (err != 0) if (err != 0)
{ {

File diff suppressed because one or more lines are too long

View file

@ -90,9 +90,19 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
// if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); // if (i < 10) fprintf(stderr, "%.s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
} }
}
auto memory_type = GGML_TYPE_F16; // Add StarChat special tokens.
for (const std::string & token : {
"<|system|>",
"<|user|>",
"<|assistant|>",
"<|end|>",
}) {
if (vocab.token_to_id.find(token) != vocab.token_to_id.end()) {
vocab.add_special_token(token);
}
}
}
// for the big tensors, we have the option to store the data in 16-bit floats or quantized // for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation // in order to save memory and also to speed up the computation
@ -144,10 +154,10 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
ctx_size += (6 + 12*n_layer)*512; // object overhead ctx_size += (6 + 12*n_layer)*1024; // object overhead
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
} }
@ -159,7 +169,6 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
params.mem_buffer = NULL; params.mem_buffer = NULL;
params.no_alloc = false; params.no_alloc = false;
model.ctx = ggml_init(params); model.ctx = ggml_init(params);
if (!model.ctx) { if (!model.ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__); fprintf(stderr, "%s: ggml_init() failed\n", __func__);
@ -250,8 +259,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
const int n_mem = n_layer*n_ctx; const int n_mem = n_layer*n_ctx;
const int n_elements = n_embd*n_mem; const int n_elements = n_embd*n_mem;
model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements); model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements); model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
@ -293,14 +302,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
} }
auto tensor = model.tensors[name.data()]; auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) { if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
return ModelLoadResult::FAIL; return ModelLoadResult::FAIL;
} }
if (ggml_nelements(tensor) != nelements) {
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file. got %d, expected %d\n",
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n", __func__, name.data(), (int) ggml_nelements(tensor), nelements);
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
return ModelLoadResult::FAIL; return ModelLoadResult::FAIL;
} }
@ -336,7 +345,6 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
fin.close(); fin.close();
return ModelLoadResult::SUCCESS; return ModelLoadResult::SUCCESS;
} }
@ -369,8 +377,16 @@ bool gpt2_eval(
static size_t buf_size = 256u*1024*1024; static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size); static void * buf = malloc(buf_size);
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { // use 2 scratch buffers
const size_t buf_size_new = 320u*1024*1024 + 1.6*(mem_per_token*N); // add 10% to account for ggml object overhead // TODO: very hacky solution - reimplement in a more elegant way
static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024;
static void * scr0 = malloc(scr0_size);
static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024;
static void * scr1 = malloc(scr1_size);
if (mem_per_token > 0 && mem_per_token*N*1.05 > buf_size) {
const size_t buf_size_new = 64u*1024*1024 + 1.15*(mem_per_token*N); // add 10% to account for ggml object overhead
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate // reallocate
@ -413,6 +429,8 @@ bool gpt2_eval(
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur; struct ggml_tensor * cur;
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
// norm // norm
{ {
// [ 768, N] // [ 768, N]
@ -559,6 +577,8 @@ bool gpt2_eval(
struct ggml_tensor * inpFF = cur; struct ggml_tensor * inpFF = cur;
ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
// feed-forward network // feed-forward network
{ {
// norm // norm
@ -615,6 +635,8 @@ bool gpt2_eval(
inpL = ggml_add(ctx0, cur, inpFF); inpL = ggml_add(ctx0, cur, inpFF);
} }
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
// norm // norm
{ {
// [ 768, N] // [ 768, N]
@ -629,6 +651,8 @@ bool gpt2_eval(
ggml_repeat(ctx0, model.ln_f_b, inpL)); ggml_repeat(ctx0, model.ln_f_b, inpL));
} }
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
// inpL = WTE * inpL // inpL = WTE * inpL
// [ 768, 50257] - model.lm_head // [ 768, 50257] - model.lm_head
// [ 768, N] - inpL // [ 768, N] - inpL

View file

@ -38,21 +38,14 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
// load hparams // load hparams
{ {
auto & hparams = model.hparams; auto & hparams = model.hparams;
hparams.par_res = 1; //true
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
if(file_format!=FileFormat::NEOX_1 && file_format!=FileFormat::NEOX_2 && file_format!=FileFormat::NEOX_3) fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
{
fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
}
if(file_format==FileFormat::NEOX_3)
{
hparams.par_res = 0;
}
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
@ -107,10 +100,10 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
{ {
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd; const size_t n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer; const size_t n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx; const size_t n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab; const size_t n_vocab = hparams.n_vocab;
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
@ -141,7 +134,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
ctx_size += (6 + 16*n_layer)*512; // object overhead ctx_size += (6 + 16*n_layer)*1024; // object overhead
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
} }
@ -300,22 +293,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
} }
size_t bpe = ggml_type_size(ggml_type(ttype)); const size_t bpe = ggml_type_size(ggml_type(ttype));
if(file_format==FileFormat::NEOX_1)
{
switch (ttype) {
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
default:
{
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ttype);
return ModelLoadResult::FAIL;
}
};
}
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
@ -409,8 +387,16 @@ bool gpt_neox_eval(
static size_t buf_size = 256u*1024*1024; static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size); static void * buf = malloc(buf_size);
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { // use 2 scratch buffers
const size_t buf_size_new = 360u*1024*1024 + 1.6*(mem_per_token*N); // add 10% to account for ggml object overhead // TODO: very hacky solution - reimplement in a more elegant way
static size_t scr0_size = (n_ctx>1024?512u:256u)*1024*1024;
static void * scr0 = malloc(scr0_size);
static size_t scr1_size = (n_ctx>1024?512u:256u)*1024*1024;
static void * scr1 = malloc(scr1_size);
if (mem_per_token > 0 && mem_per_token*N*1.05 > buf_size) {
const size_t buf_size_new = 64u*1024*1024 + 1.15*(mem_per_token*N); // add 10% to account for ggml object overhead
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate // reallocate
@ -445,6 +431,8 @@ bool gpt_neox_eval(
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur; struct ggml_tensor * cur;
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
// self-attention // self-attention
{ {
{ {
@ -548,6 +536,8 @@ bool gpt_neox_eval(
} }
} }
ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
if (hparams.par_res == 0) { if (hparams.par_res == 0) {
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
@ -570,6 +560,8 @@ bool gpt_neox_eval(
} }
} }
ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
// norm // norm
{ {
inpL = ggml_norm(ctx0, inpL); inpL = ggml_norm(ctx0, inpL);
@ -582,6 +574,8 @@ bool gpt_neox_eval(
ggml_repeat(ctx0, model.ln_f_b, inpL)); ggml_repeat(ctx0, model.ln_f_b, inpL));
} }
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
// lm_head // lm_head
{ {
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);

View file

@ -126,37 +126,53 @@ std::wstring convert_to_wstring(const std::string & input) {
return converter.from_bytes(input); return converter.from_bytes(input);
} }
// Splits `str` into word-like pieces using the GPT-2 style pre-tokenization
// pattern and appends each piece, in order of appearance, to `words`.
// Characters that the pattern does not match are skipped; `words` is only
// appended to, never cleared.
void gpt_split_words(std::string str, std::vector<std::string>& words) {
    const std::regex splitter(R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");

    std::string::const_iterator cursor = str.cbegin();
    const std::string::const_iterator stop = str.cend();

    std::smatch found;
    // Each search restarts at the end of the previous match, so the remaining
    // text is treated as a fresh sequence on every round.
    while (std::regex_search(cursor, stop, found, splitter)) {
        for (const auto & piece : found) {
            words.push_back(piece.str());
        }
        cursor = found[0].second;
    }
}
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words; std::vector<std::string> words;
// first split the text into words // first split the text into words
{ {
std::string str = text; std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
// Generate the subpattern from the special_tokens vector if it's not empty // Generate the subpattern from the special_tokens vector if it's not empty
if (!vocab.special_tokens.empty()) { if (!vocab.special_tokens.empty()) {
const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
std::string special_tokens_subpattern; std::string special_tokens_subpattern;
for (const auto & token : vocab.special_tokens) { for (const auto & token : vocab.special_tokens) {
if (!special_tokens_subpattern.empty()) { if (!special_tokens_subpattern.empty()) {
special_tokens_subpattern += "|"; special_tokens_subpattern += "|";
} }
special_tokens_subpattern += token; special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
} }
// Modify the regex pattern with the generated special tokens subpattern std::regex re(special_tokens_subpattern);
pat = special_tokens_subpattern + "|" + pat; std::smatch m;
} // Split the text by special tokens.
while (std::regex_search(str, m, re)) {
std::regex re(pat); // Split the substrings in-between special tokens into words.
std::smatch m; gpt_split_words(m.prefix(), words);
// Add matched special tokens as words.
while (std::regex_search(str, m, re)) { for (auto x : m) {
for (auto x : m) { words.push_back(x);
words.push_back(x); }
str = m.suffix();
} }
str = m.suffix(); // Remaining text without special tokens will be handled below.
} }
gpt_split_words(str, words);
} }
// find the longest token that forms each word in words: // find the longest token that forms each word in words: