Merge branch 'master' into fix-whitespace

commit d8beb85c74
Georgi Gerganov, 2023-08-23 23:09:35 +03:00 (committed by GitHub, GPG key ID: 4AEE18F83AFDEB23)

20 changed files with 1731 additions and 772 deletions


@@ -744,35 +744,3 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok
     return std::string(result.data(), result.size());
 }
-
-std::vector<llama_token> llama_tokenize_bpe(
-    struct llama_context * ctx,
-    const std::string & text,
-    bool add_bos) {
-    int n_tokens = text.length() + add_bos;
-    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-    return result;
-}
-
-std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-    return std::string(result.data(), result.size());
-}


@@ -120,15 +120,6 @@ std::vector<llama_token> llama_tokenize(
     const std::string & text,
     bool add_bos);

-std::vector<llama_token> llama_tokenize_bpe(
-    struct llama_context * ctx,
-    const std::string & text,
-    bool add_bos);
-
 std::string llama_token_to_str(
     const struct llama_context * ctx,
     llama_token token);
-
-std::string llama_token_to_str_bpe(
-    const struct llama_context * ctx,
-    llama_token token);


@@ -95,14 +95,17 @@ print("gguf: get model metadata")
 block_count = hparams["n_layer"]

-gguf_writer.add_name(last_dir)
+gguf_writer.add_name("Falcon")
 gguf_writer.add_context_length(2048) # not in config.json
 gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
 gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+if "n_head_kv" in hparams:
+    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+else:
+    gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])

 # TOKENIZATION

@@ -110,6 +113,8 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
 print("gguf: get tokenizer metadata")

 tokens: List[str] = []
+scores: List[float] = []
+toktypes: List[int] = []
 merges: List[str] = []

@@ -153,41 +158,30 @@ if Path(dir_model + "/tokenizer.json").is_file():
             text = bytearray(pad_token)

         tokens.append(text)
+        scores.append(0.0)  # dymmy
+        toktypes.append(gguf.TokenType.NORMAL)  # dummy

     gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)

-    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
-        print("gguf: get special token ids")
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
-
-        # find special token ids
-
-        if "bos_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]:
-                    gguf_writer.add_bos_token_id(key["id"])
-
-        if "eos_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]:
-                    gguf_writer.add_eos_token_id(key["id"])
-
-        if "unk_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]:
-                    gguf_writer.add_unk_token_id(key["id"])
-
-        if "sep_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]:
-                    gguf_writer.add_pad_token_id(key["id"])
+print("gguf: get special token ids")
+# Look for special tokens in config.json
+
+if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+    gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+    gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+    gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+    gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+    gguf_writer.add_pad_token_id(hparams["pad_token_id"])

 # TENSORS

@@ -195,8 +189,9 @@ if Path(dir_model + "/tokenizer.json").is_file():
 tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

 # params for qkv transform
 n_head = hparams["n_head"]
 n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1

 head_dim = hparams["hidden_size"] // n_head

 # tensor info


@@ -733,7 +733,11 @@ class OutputFile:
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

     def add_meta_arch(self, params: Params) -> None:
-        self.gguf.add_name                ("LLaMA")
+        ver = None
+        if (params.n_ctx == 4096):
+            ver = "v2"
+        self.gguf.add_name                ("LLaMA" if ver == None else "LLaMA " + ver)
         self.gguf.add_context_length      (params.n_ctx)
         self.gguf.add_embedding_length    (params.n_embd)
         self.gguf.add_block_count         (params.n_layer)


@@ -12,15 +12,19 @@ usage: ./convert-llama2c-to-ggml [options]

 options:
   -h, --help                       show this help message and exit
-  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'models/ggml-vocab.bin')
+  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'tokenizer.bin')
   --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
   --llama2c-output-model FNAME     model path to save the converted llama2.c model (default ak_llama_model.bin')
 ```

-An example command is as follows:
+An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:

-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`

-Now you can use the model with command like:
+For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:

-`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
+`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
+
+Now you can use the model with a command like:
+
+`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
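Read end to end, the three README commands above form a single pipeline. A minimal sketch of that workflow, assuming a local llama2.c checkout (for `tokenizer.bin`) and the `stories42M.bin` checkpoint sit next to the llama.cpp build directory:

```sh
# 1. convert Karpathy's llama2.c checkpoint to the legacy GGJTv3 format
./convert-llama2c-to-ggml \
    --copy-vocab-from-model ../llama2.c/tokenizer.bin \
    --llama2c-model stories42M.bin \
    --llama2c-output-model stories42M.ggmlv3.bin

# 2. convert the GGJTv3 file to gguf, passing the norm epsilon explicitly
python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin

# 3. run inference on the converted model
./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256
```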


@@ -17,6 +17,9 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_VERSION_GGJT_V3 3
+
 //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
 typedef struct {
     int dim; // transformer dimension
@@ -49,10 +52,10 @@ typedef struct {
     // float* freq_cis_real; // (seq_len, dim/2)
     // float* freq_cis_imag; // (seq_len, dim/2)
     // (optional) classifier weights for the logits, on the last layer
-    //float* wcls;
+    float* wcls;
 } TransformerWeights;

-void malloc_weights(TransformerWeights* w, Config* p) {
+void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
     // we calloc instead of malloc to keep valgrind happy
     w->token_embedding_table = new float[p->vocab_size * p->dim]();
     printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@@ -86,9 +89,16 @@ void malloc_weights(TransformerWeights* w, Config* p) {
     w->rms_final_weight = new float[p->dim]();
     printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+
+    if (shared_weights) {
+        w->wcls = NULL;
+    } else {
+        w->wcls = new float[p->vocab_size * p->dim]();
+        printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+    }
 }

-int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
     if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
     if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
     if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@@ -100,6 +110,22 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
     if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
     if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
     if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+
+    // Skip freq_cis_real & freq_cis_imag
+    int head_size = p->dim / p->n_heads;
+    fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
+
+    if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+
+    // Check we didn't forget to read anything
+    auto curr = ftell(f);
+    fseek(f, 0, SEEK_END);
+    auto end = ftell(f);
+    if (curr != end) {
+        printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
+        return 1;
+    }
+
     return 0;
 }
@@ -115,6 +141,7 @@ void free_weights(TransformerWeights* w) {
     delete w->w2;
     delete w->w3;
     delete w->rms_final_weight;
+    if (w->wcls) delete w->wcls;
 }

 void print_sample_weights(TransformerWeights *w){
@@ -131,6 +158,7 @@ void print_sample_weights(TransformerWeights *w){
     printf("%f\n", w->w2[0]);
     printf("%f\n", w->w3[0]);
     printf("%f\n", w->rms_att_weight[0]);
+    if (w->wcls) printf("%f\n", w->wcls[0]);
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -509,26 +537,28 @@ bool is_ggml_file(const char *filename) {
 }

 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
-    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
-    if (is_ggml_file(filename)) {
-
-        struct llama_context_params llama_params = llama_context_default_params();
-        llama_params.vocab_only = true;
-
-        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
-        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
-
-        const int n_vocab = llama_n_vocab(lctx);
-        vocab->id_to_token.resize(n_vocab);
-        for (int i=0; i<n_vocab; ++i) {
-            vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
-            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
-            vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
-            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
-        }
-        llama_free(lctx);
-        llama_free_model(lmodel);
-    } else { // assume llama2.c vocabulary
+#pragma message("TODO: implement reading vocabulary using gguf")
+    // // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+    // if (is_ggml_file(filename)) {
+    //
+    //     struct llama_context_params llama_params = llama_context_default_params();
+    //     llama_params.vocab_only = true;
+    //
+    //     struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+    //     struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+    //
+    //     const int n_vocab = llama_n_vocab(lctx);
+    //     vocab->id_to_token.resize(n_vocab);
+    //     for (int i=0; i<n_vocab; ++i) {
+    //         vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
+    //         vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+    //         vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
+    //         vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
+    //     }
+    //     llama_free(lctx);
+    //     llama_free_model(lmodel);
+    // } else
+    { // assume llama2.c vocabulary
         printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
         llama_file file(filename, "rb");
         const int n_vocab = config->vocab_size;
@@ -538,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
             float_t score = file.read_f32();
             uint32_t len = file.read_u32();
             std::string text = file.read_string(len);
+            // Special-case handling of <0xXX> single byte tokens.
+            char byte_val;
+            if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
+                char cstr[2] = { byte_val, 0 };
+                text = cstr;
+            }
             vocab->id_to_token[i].text = text;
             vocab->id_to_token[i].score = score;
             vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
@@ -589,83 +625,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     }

 #pragma message("TODO: implement file saving using gguf")
-    (void) vocab;
-    (void) model;
-    (void) w;
-    // // write_magic
-    // file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    // file.write_u32(LLAMA_FILE_VERSION); // version
-    // // write_hparams
-    // file.write_u32(model->hparams.n_vocab);
-    // file.write_u32(model->hparams.n_embd);
-    // file.write_u32(model->hparams.n_mult);
-    // file.write_u32(model->hparams.n_head);
-    // file.write_u32(model->hparams.n_layer);
-    // file.write_u32(model->hparams.n_rot);
-    // file.write_u32(LLAMA_FTYPE_ALL_F32);
-    //
-    // // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-    // uint32_t n_vocab = model->hparams.n_vocab;
-    // for (uint32_t i = 0; i < n_vocab; i++) {
-    //     const auto & token_data = vocab->id_to_token.at(i);
-    //     file.write_u32((uint32_t) token_data.tok.size());
-    //     file.write_raw(token_data.tok.data(), token_data.tok.size());
-    //     file.write_raw(&token_data.score, sizeof(token_data.score));
-    // }
-    //
-    // // stuff AK weights into GG weights one by one.
-    // // w->token_embedding_table -> model->tok_embeddings
-    // // float*                   -> struct ggml_tensor
-    // stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-    // stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
-    //
-    // stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-    // //print_row(model->norm, 0);
-    //
-    // // for rms-att-weight
-    // int row_length = model->hparams.n_embd;
-    // const auto & hparams = model->hparams;
-    // //int n_ff = model->hparams.n_embd;
-    // int n_ff = get_n_ff(&hparams);
-    //
-    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-    //     auto & layer = model->layers[i];
-    //     // 1d
-    //     stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-    //
-    //     // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-    //     stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-    //
-    //     stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-    //     stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-    //     stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
-    // }
-    // // write tensors
-    // write_tensor(&file, model->tok_embeddings);
-    // write_tensor(&file, model->norm);
-    // write_tensor(&file, model->output); // ?
-    // for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-    //     auto & layer = model->layers[i];
-    //
-    //     write_tensor(&file, layer.attention_norm);
-    //     write_tensor(&file, layer.wq);
-    //     write_tensor(&file, layer.wk);
-    //     write_tensor(&file, layer.wv);
-    //     write_tensor(&file, layer.wo);
-    //     write_tensor(&file, layer.ffn_norm);
-    //     write_tensor(&file, layer.w1);
-    //     write_tensor(&file, layer.w2);
-    //     write_tensor(&file, layer.w3);
-    // }
+    // write_magic
+    file.write_u32(LLAMA_FILE_MAGIC_GGJT);   // magic
+    file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
+    // write_hparams
+    file.write_u32(model->hparams.n_vocab);
+    file.write_u32(model->hparams.n_embd);
+    file.write_u32(model->hparams.n_mult);
+    file.write_u32(model->hparams.n_head);
+    file.write_u32(model->hparams.n_layer);
+    file.write_u32(model->hparams.n_rot);
+    file.write_u32(LLAMA_FTYPE_ALL_F32);
+
+    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+    uint32_t n_vocab = model->hparams.n_vocab;
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        const auto & token_data = vocab->id_to_token.at(i);
+        file.write_u32((uint32_t) token_data.text.size());
+        file.write_raw(token_data.text.data(), token_data.text.size());
+        file.write_raw(&token_data.score, sizeof(token_data.score));
+    }
+
+    // stuff AK weights into GG weights one by one.
+    // w->token_embedding_table -> model->tok_embeddings
+    // float*                   -> struct ggml_tensor
+    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+
+    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    //print_row(model->norm, 0);
+
+    // for rms-att-weight
+    int row_length = model->hparams.n_embd;
+    const auto & hparams = model->hparams;
+    //int n_ff = model->hparams.n_embd;
+    int n_ff = get_n_ff(&hparams);
+
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+        auto & layer = model->layers[i];
+        // 1d
+        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+
+        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+    }
+    // write tensors
+    write_tensor(&file, model->tok_embeddings);
+    write_tensor(&file, model->norm);
+    write_tensor(&file, model->output); // ?
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        write_tensor(&file, layer.attention_norm);
+        write_tensor(&file, layer.wq);
+        write_tensor(&file, layer.wk);
+        write_tensor(&file, layer.wv);
+        write_tensor(&file, layer.wo);
+        write_tensor(&file, layer.ffn_norm);
+        write_tensor(&file, layer.w1);
+        write_tensor(&file, layer.w2);
+        write_tensor(&file, layer.w3);
+    }
 }

 struct train_params get_default_train_params() {
     struct train_params params;
-    params.fn_vocab_model          = "models/ggml-vocab.bin";
+    params.fn_vocab_model          = "tokenizer.bin";
     params.fn_llama2c_output_model = "ak_llama_model.bin";
     params.fn_train_data           = "shakespeare.txt";
     params.fn_checkpoint_in        = "checkpoint.bin";
@@ -718,7 +751,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
     fprintf(stderr, "  -h, --help                       show this help message and exit\n");
-    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
     fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
     fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
     fprintf(stderr, "\n");
@@ -791,9 +824,12 @@ int main(int argc, char ** argv) {
     if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
     // read in the config header
     if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+    auto shared_weights = config.vocab_size > 0;
+    config.vocab_size = abs(config.vocab_size);
+
     // read in the Transformer weights
-    malloc_weights(&weights, &config);
-    if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
+    malloc_weights(&weights, &config, shared_weights);
+    if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
     fclose(file);
 }


@@ -43,7 +43,7 @@ static bool is_interacting = false;
 void sigint_handler(int signo) {
     if (signo == SIGINT) {
         if (!is_interacting) {
-            is_interacting=true;
+            is_interacting = true;
         } else {
             console::cleanup();
             printf("\n");
@@ -189,10 +189,12 @@ int main(int argc, char ** argv) {
         }
     }

+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+
     // tokenize the prompt
     std::vector<llama_token> embd_inp;

     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
-        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+        embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
     } else {
         embd_inp = session_tokens;
     }
@@ -208,9 +210,9 @@ int main(int argc, char ** argv) {
     int original_prompt_len = 0;
     if (ctx_guidance) {
         params.cfg_negative_prompt.insert(0, 1, ' ');
-        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true);
+        guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);

-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
+        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
         original_prompt_len = original_inp.size();
         guidance_offset = (int)guidance_inp.size() - original_prompt_len;
     }
@@ -257,8 +259,8 @@ int main(int argc, char ** argv) {
     }

     // prefix & suffix for instruct mode
-    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
+    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
     const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);

     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {


@@ -28,7 +28,6 @@ std::vector<float> softmax(const std::vector<float>& logits) {
 }

 void perplexity_v2(llama_context * ctx, const gpt_params & params) {
     // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
@@ -38,7 +37,13 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
         fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
         return;
     }
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+
+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = is_spm;
+
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+
+    auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

     const int calc_chunk = params.n_ctx;
@@ -86,7 +91,7 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
             const auto token_org = tokens[batch_start];

             // add BOS token for the first batch of each chunk
-            if (j == 0) {
+            if (add_bos && j == 0) {
                 tokens[batch_start] = llama_token_bos(ctx);
             }
@@ -136,7 +141,6 @@ void perplexity_v2(llama_context * ctx, const gpt_params & params) {
 }

 void perplexity(llama_context * ctx, const gpt_params & params) {
     if (params.ppl_stride > 0) {
         perplexity_v2(ctx, params);
         return;
@@ -146,7 +150,13 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
     // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+
+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+    const bool add_bos = is_spm;
+
+    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
+
+    auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);

     const int n_chunk_max = tokens.size() / params.n_ctx;
@@ -177,7 +187,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
             const auto token_org = tokens[batch_start];

             // add BOS token for the first batch of each chunk
-            if (j == 0) {
+            if (add_bos && j == 0) {
                 tokens[batch_start] = llama_token_bos(ctx);
             }
@@ -295,8 +305,10 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     size_t hs_task_count = prompt_lines.size()/6;
     fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);

+    const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
+
     // This is needed as usual for LLaMA models
-    bool prepend_bos = true;
+    const bool add_bos = is_spm;

     // Number of tasks to use when computing the score
     if ( params.hellaswag_tasks < hs_task_count ) {
@@ -352,14 +364,13 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     std::vector<float> tok_logits(n_vocab);

     for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
         // Tokenize the context to count tokens
-        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
+        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
         size_t context_size = context_embd.size();

         // Do the 1st ending
         // In this case we include the context when evaluating
-        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
+        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
         auto query_size = query_embd.size();

         //printf("First query: %d\n",(int)query_size);


@@ -238,7 +238,7 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
     alloc->n_free_blocks++;
 }

-void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
     int pos = 0;
     for (int i = 0; i < n; i++) {
         if (list[i] != -1) {
@@ -547,7 +547,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                     struct ggml_tensor * view_src = get_view_source(parent);
                     struct hash_node * view_src_hn = hash_get(ht, view_src);
                     view_src_hn->n_views -= 1;
-                    AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
+                    AT_PRINTF("view_src %s\n", view_src->name);
                     if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
                         ggml_allocator_free_tensor(alloc, view_src);
                     }


@@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);

 // tell the allocator to parse nodes following the order described in the list
 // you should call this if your graph are optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);

 GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);


@@ -3907,6 +3907,29 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
     dst[i + 1] = x0*sin_theta + x1*cos_theta;
 }

+// TODO: this implementation is wrong!
+//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
+//                                     const float p_delta, const int p_delta_rows, const float theta_scale) {
+//    const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
+//
+//    if (col >= ncols) {
+//        return;
+//    }
+//
+//    const int row = blockDim.x*blockIdx.x + threadIdx.x;
+//    const int i = row*ncols + col/2;
+//
+//    const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
+//    const float sin_theta = sinf(theta);
+//    const float cos_theta = cosf(theta);
+//
+//    const float x0 = x[i + 0];
+//    const float x1 = x[i + ncols/2];
+//
+//    dst[i + 0]       = x0*cos_theta - x1*sin_theta;
+//    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+//}
+
 static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
     const int col = blockDim.x*blockIdx.x + threadIdx.x;
     const int half_n_dims = ncols/4;
@@ -5515,7 +5538,8 @@ inline void ggml_cuda_op_rope(
     const float theta_scale = powf(freq_base, -2.0f/n_dims);

-    const bool is_glm = mode & 4;
+    const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;

     // compute
     if (is_glm) {
@@ -5523,6 +5547,9 @@ inline void ggml_cuda_op_rope(
         const float id_p = min(p, n_ctx - 2.f);
         const float block_p = max(p - (n_ctx - 2.f), 0.f);
         rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
+    } else if (is_neox) {
+        GGML_ASSERT(false && "RoPE NeoX not implemented yet");
+#pragma message("TODO: implement RoPE NeoX for CUDA")
     } else {
         const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
         rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);


@@ -167,7 +167,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #define GGML_METAL_ADD_KERNEL(name) \
         ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
         ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
-        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \
+        fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
+                (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
+                (int) ctx->pipeline_##name.threadExecutionWidth); \
         if (error) { \
             fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
             return NULL; \
@@ -218,12 +220,12 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #undef GGML_METAL_ADD_KERNEL
     }

     fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     fprintf(stderr, "%s: hasUnifiedMemory             = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
     if (ctx->device.maxTransferRate != 0) {
         fprintf(stderr, "%s: maxTransferRate              = %8.2f MB/s\n",  __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
     } else {
         fprintf(stderr, "%s: maxTransferRate              = built-in GPU\n", __func__);
     }

     return ctx;
@@ -537,8 +539,8 @@ void ggml_metal_graph_compute(
                 id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];

                 const int node_start =                                  (cb_idx + 0) * n_nodes_per_cb;
-                const int node_end   = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+                const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);

                 for (int ind = node_start; ind < node_end; ++ind) {
                     const int i = has_concur ? ctx->concur_list[ind] : ind;
@@ -744,32 +746,31 @@ void ggml_metal_graph_compute(
                            [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
                            ne00%32 == 0 &&
                            ne11 > 1) {
                            switch (src0->type) {
                                case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32];  break;
                                case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
                                case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
                                case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
                                case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
                                case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
                                case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
                                case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
                                default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
                            }
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:8];
                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:9];
                            [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:10];
                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
                            [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                        }
-                        else {
+                        } else {
                            int nth0 = 32;
                            int nth1 = 1;
@@ -868,24 +869,24 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
                            [encoder setBytes:&gqa  length:sizeof(gqa)  atIndex:17];

                            if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                                src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #else
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #endif
                            }
                            else if (src0t == GGML_TYPE_Q5_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            }
                            else if (src0t == GGML_TYPE_Q6_K) {
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                            } else {
                                [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -938,16 +939,17 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_NORM:
                        {
-                            const float eps = 1e-5f;
+                            float eps;
+                            memcpy(&eps, dst->op_params, sizeof(float));

                            const int nth = 256;

                            [encoder setComputePipelineState:ctx->pipeline_norm];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                            [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];

                            const int64_t nrows = ggml_nrows(src0);
@@ -990,7 +992,9 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
                            [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];

                            const int nth = 32;

                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    case GGML_OP_ROPE:
@@ -1005,8 +1009,8 @@ void ggml_metal_graph_compute(
                            memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

                            [encoder setComputePipelineState:ctx->pipeline_rope];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
                            [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
@@ -1057,24 +1061,24 @@ void ggml_metal_graph_compute(
                                default: GGML_ASSERT(false && "not implemented");
                            }

                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
                            [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
                            [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
                            [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
                            [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
                            [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
                            [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
                            [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
                            [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
                            [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];

                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;


@@ -87,7 +87,12 @@ kernel void kernel_gelu(
         device float * dst,
         uint tpig[[thread_position_in_grid]]) {
     float x = src0[tpig];
-    dst[tpig] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+
+    // BEWARE !!!
+    // Simply using "tanh" instead of "precise::tanh" will sometimes results in NaNs!
+    // This was observed with Falcon 7B and 40B models
+    //
+    dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
 }

 kernel void kernel_soft_max(
@@ -571,7 +576,25 @@ kernel void kernel_rope(
             dst_data[1] = x0*sin_theta + x1*cos_theta;
         }
     } else {
-        // TODO: implement
+        for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
+            for (int64_t ic = 0; ic < n_dims; ic += 2) {
+                const float cos_theta = cos(theta);
+                const float sin_theta = sin(theta);
+
+                theta *= theta_scale;
+
+                const int64_t i0 = ib*n_dims + ic/2;
+
+                device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                device       float * dst_data  = (device float *)((device char *)  dst + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+
+                const float x0 = src[0];
+                const float x1 = src[n_dims/2];
+
+                dst_data[0]        = x0*cos_theta - x1*sin_theta;
+                dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
+            }
+        }
     }
 }

ggml.c

@ -3554,9 +3554,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; } inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
static const float GELU_COEF_A = 0.044715f; static const float GELU_COEF_A = 0.044715f;
static const float GELU_QUICK_COEF = -1.702f; static const float GELU_QUICK_COEF = -1.702f;
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
inline static float ggml_gelu_f32(float x) { inline static float ggml_gelu_f32(float x) {
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
@ -5555,10 +5555,6 @@ struct ggml_tensor * ggml_repeat(
is_node = true; is_node = true;
} }
if (ggml_are_same_shape(a, b) && !is_node) {
return a;
}
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
result->op = GGML_OP_REPEAT; result->op = GGML_OP_REPEAT;
@ -5789,6 +5785,7 @@ struct ggml_tensor * ggml_silu_back(
static struct ggml_tensor * ggml_norm_impl( static struct ggml_tensor * ggml_norm_impl(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
float eps,
bool inplace) { bool inplace) {
bool is_node = false; bool is_node = false;
@ -5799,7 +5796,7 @@ static struct ggml_tensor * ggml_norm_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
// TODO: maybe store epsilon here? ggml_set_op_params(result, &eps, sizeof(eps));
result->op = GGML_OP_NORM; result->op = GGML_OP_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -5810,14 +5807,16 @@ static struct ggml_tensor * ggml_norm_impl(
struct ggml_tensor * ggml_norm( struct ggml_tensor * ggml_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a) { struct ggml_tensor * a,
return ggml_norm_impl(ctx, a, false); float eps) {
return ggml_norm_impl(ctx, a, eps, false);
} }
struct ggml_tensor * ggml_norm_inplace( struct ggml_tensor * ggml_norm_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a) { struct ggml_tensor * a,
return ggml_norm_impl(ctx, a, true); float eps) {
return ggml_norm_impl(ctx, a, eps, true);
} }
// ggml_rms_norm // ggml_rms_norm
@ -10619,7 +10618,8 @@ static void ggml_compute_forward_norm_f32(
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS;
const float eps = 1e-5f; // TODO: make this a parameter float eps;
memcpy(&eps, dst->op_params, sizeof(float));
// TODO: optimize // TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
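For context, the epsilon that is now stored in op_params (instead of being hardcoded to 1e-5f) enters the row-wise standardization that ggml_compute_forward_norm_f32 performs; for each row x of length n (no learned scale or shift at this level):

\[
y_i = \frac{x_i - \mu}{\sqrt{\sigma^2 + \varepsilon}},
\qquad
\mu = \frac{1}{n}\sum_{j=1}^{n} x_j,
\qquad
\sigma^2 = \frac{1}{n}\sum_{j=1}^{n} (x_j - \mu)^2 .
\]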
@@ -12537,7 +12537,7 @@ static void ggml_compute_forward_rope_f32(
                     dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
                 }
             } else {
-                // TODO: this is probably wrong, but I can't figure it out ..
+                // TODO: this might be wrong for ne0 != n_dims - need double check
                 // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
                 for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                     for (int64_t ic = 0; ic < n_dims; ic += 2) {

@@ -12666,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
                     dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
                 }
             } else {
-                // TODO: this is probably wrong, but I can't figure it out ..
+                // TODO: this might be wrong for ne0 != n_dims - need double check
                 // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
                 for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
                     for (int64_t ic = 0; ic < n_dims; ic += 2) {

ggml.h (7 changes)

@@ -909,14 +909,15 @@ extern "C" {
             struct ggml_tensor * b);

     // normalize along rows
-    // TODO: eps is hardcoded to 1e-5 for now
     GGML_API struct ggml_tensor * ggml_norm(
             struct ggml_context * ctx,
-            struct ggml_tensor * a);
+            struct ggml_tensor * a,
+            float eps);

     GGML_API struct ggml_tensor * ggml_norm_inplace(
             struct ggml_context * ctx,
-            struct ggml_tensor * a);
+            struct ggml_tensor * a,
+            float eps);

     GGML_API struct ggml_tensor * ggml_rms_norm(
             struct ggml_context * ctx,

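A minimal call-site sketch for the new signatures; the helper name and the 1e-5f value are illustrative, not taken from this diff:

#include "ggml.h"

// Build a normalization node with an explicit epsilon; callers now pass the value
// (typically the model's layer_norm_epsilon) instead of relying on a hardcoded 1e-5.
static struct ggml_tensor * build_norm(struct ggml_context * ctx, struct ggml_tensor * x, float eps) {
    return ggml_norm(ctx, x, eps);
}

// usage (illustrative): struct ggml_tensor * cur = build_norm(ctx0, inp, 1e-5f);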
gguf.py (26 changes)

@@ -30,12 +30,12 @@ KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
 KEY_GENERAL_FILE_TYPE = "general.file_type"

 # LLM
-KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length"
-KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length"
-KEY_LLM_BLOCK_COUNT = "{arch}.block_count"
-KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
-KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
-KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
+KEY_CONTEXT_LENGTH = "{arch}.context_length"
+KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
+KEY_BLOCK_COUNT = "{arch}.block_count"
+KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
+KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"

 # attention
 KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"

@@ -583,7 +583,7 @@ class GGUFWriter:
         self.add_string(KEY_GENERAL_AUTHOR, author)

     def add_tensor_data_layout(self, layout: str):
-        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+        self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

     def add_url(self, url: str):
         self.add_string(KEY_GENERAL_URL, url)

@@ -613,27 +613,27 @@ class GGUFWriter:
     def add_context_length(self, length: int):
         self.add_uint32(
-            KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
+            KEY_CONTEXT_LENGTH.format(arch=self.arch), length)

     def add_embedding_length(self, length: int):
         self.add_uint32(
-            KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
+            KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)

     def add_block_count(self, length: int):
         self.add_uint32(
-            KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
+            KEY_BLOCK_COUNT.format(arch=self.arch), length)

     def add_feed_forward_length(self, length: int):
         self.add_uint32(
-            KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+            KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

     def add_parallel_residual(self, use: bool):
         self.add_bool(
-            KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+            KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

     def add_tensor_data_layout(self, layout: str):
         self.add_string(
-            KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+            KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)

     def add_head_count(self, count: int):
         self.add_uint32(

llama.cpp (1817 changes)

File diff suppressed because it is too large.

llama.h (15 changes)

@@ -247,6 +247,8 @@ extern "C" {
     LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
+
     LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
     LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
     LLAMA_API int llama_model_n_embd (const struct llama_model * model);

@@ -368,13 +370,6 @@ extern "C" {
             int n_max_tokens,
             bool add_bos);

-    LLAMA_API int llama_tokenize_bpe(
-            struct llama_context * ctx,
-            const char * text,
-            llama_token * tokens,
-            int n_max_tokens,
-            bool add_bos);
-
     LLAMA_API int llama_tokenize_with_model(
             const struct llama_model * model,
             const char * text,

@@ -390,12 +385,6 @@ extern "C" {
             char * buf,
             int length);

-    LLAMA_API int llama_token_to_str_bpe(
-            const struct llama_context * ctx,
-            llama_token token,
-            char * buf,
-            int length);
-
     LLAMA_API int llama_token_to_str_with_model(
             const struct llama_model * model,
             llama_token token,
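A hedged sketch of how a caller adapts: with the *_bpe entry points gone, the unified tokenizer functions are used for every model, and the new llama_vocab_type query lets a BPE-only tool guard itself explicitly. The helper below is illustrative, not part of the API:

#include "llama.h"
#include <cstdio>

// Returns true only when the loaded model uses a BPE vocab (e.g. Falcon);
// tokenization itself always goes through the regular llama_tokenize / llama_token_to_str calls.
static bool require_bpe_vocab(const struct llama_context * ctx) {
    if (llama_vocab_type(ctx) != LLAMA_VOCAB_TYPE_BPE) {
        fprintf(stderr, "expected a BPE vocab, got a different vocab type\n");
        return false;
    }
    return true;
}

This mirrors the GGML_ASSERT guard that the updated tokenizer test adds before walking the vocab.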

tests/CMakeLists.txt

@@ -28,7 +28,8 @@ llama_build_and_test_executable(test-sampling.cpp)
 llama_build_executable(test-tokenizer-0.cpp)
 llama_test_executable (test-tokenizer-0.llama test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 llama_build_executable(test-tokenizer-1.cpp)
-llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
+#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 #llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)

tests/test-tokenizer-1.cpp

@@ -67,11 +67,13 @@ int main(int argc, char **argv) {
         }
     }

+    GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
+
     const int n_vocab = llama_n_vocab(ctx);

     for (int i = 0; i < n_vocab; ++i) {
-        std::string forward = llama_token_to_str_bpe(ctx, i);
-        std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false);
+        std::string forward = llama_token_to_str(ctx, i);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
         if (tokens.size() == 1) {
             if (i != tokens[0]) {
                 std::string backward = llama_token_to_str(ctx, tokens[0]);

@@ -79,16 +81,6 @@ int main(int argc, char **argv) {
                     __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
                 return 2;
             }
-        } else {
-            llama_token_type type = llama_token_get_type(ctx, i);
-            if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
-                fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
-                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
-            } else {
-                fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
-                    __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
-                return 2;
-            }
         }
     }