llama2c: reinstate ggmlv3 conversion output + update readme w/ gguf conv

This commit is contained in:
ochafik 2023-08-22 21:12:29 +01:00
parent 0f7cb95352
commit 59f67c69a7
2 changed files with 83 additions and 79 deletions

View file

@ -12,15 +12,19 @@ usage: ./convert-llama2c-to-ggml [options]
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--copy-vocab-from-model FNAME model path from which to copy vocab (default 'models/ggml-vocab.bin') --copy-vocab-from-model FNAME model path from which to copy vocab (default 'tokenizer.bin')
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
--llama2c-output-model FNAME model path to save the converted llama2.c model (default 'ak_llama_model.bin') --llama2c-output-model FNAME model path to save the converted llama2.c model (default 'ak_llama_model.bin')
``` ```
An example command is as follows: An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>` `$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
Now you can use the model with command like: For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5` `$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
Now you can use the model with a command like:
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View file

@ -17,6 +17,9 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc. //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct { typedef struct {
int dim; // transformer dimension int dim; // transformer dimension
@ -614,83 +617,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
} }
#pragma message("TODO: implement file saving using gguf") #pragma message("TODO: implement file saving using gguf")
(void) vocab; // write_magic
(void) model; file.write_u32(LLAMA_FILE_MAGIC_GGJT); // magic
(void) w; file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
// // write_magic // write_hparams
// file.write_u32(LLAMA_FILE_MAGIC); // magic file.write_u32(model->hparams.n_vocab);
// file.write_u32(LLAMA_FILE_VERSION); // version file.write_u32(model->hparams.n_embd);
// // write_hparams file.write_u32(model->hparams.n_mult);
// file.write_u32(model->hparams.n_vocab); file.write_u32(model->hparams.n_head);
// file.write_u32(model->hparams.n_embd); file.write_u32(model->hparams.n_layer);
// file.write_u32(model->hparams.n_mult); file.write_u32(model->hparams.n_rot);
// file.write_u32(model->hparams.n_head); file.write_u32(LLAMA_FTYPE_ALL_F32);
// file.write_u32(model->hparams.n_layer);
// file.write_u32(model->hparams.n_rot); // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
// file.write_u32(LLAMA_FTYPE_ALL_F32); uint32_t n_vocab = model->hparams.n_vocab;
// for (uint32_t i = 0; i < n_vocab; i++) {
// // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk. const auto & token_data = vocab->id_to_token.at(i);
// uint32_t n_vocab = model->hparams.n_vocab; file.write_u32((uint32_t) token_data.text.size());
// for (uint32_t i = 0; i < n_vocab; i++) { file.write_raw(token_data.text.data(), token_data.text.size());
// const auto & token_data = vocab->id_to_token.at(i); file.write_raw(&token_data.score, sizeof(token_data.score));
// file.write_u32((uint32_t) token_data.tok.size()); }
// file.write_raw(token_data.tok.data(), token_data.tok.size());
// file.write_raw(&token_data.score, sizeof(token_data.score)); // stuff AK weights into GG weights one by one.
// } // w->token_embedding_table -> model->tok_embeddings
// // float* -> struct ggml_tensor
// // stuff AK weights into GG weights one by one. stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
// // w->token_embedding_table -> model->tok_embeddings stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
// // float* -> struct ggml_tensor
// stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
// stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table); //print_row(model->norm, 0);
//
// stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); // for rms-att-weight
// //print_row(model->norm, 0); int row_length = model->hparams.n_embd;
// const auto & hparams = model->hparams;
// // for rms-att-weight //int n_ff = model->hparams.n_embd;
// int row_length = model->hparams.n_embd; int n_ff = get_n_ff(&hparams);
// const auto & hparams = model->hparams;
// //int n_ff = model->hparams.n_embd; for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
// int n_ff = get_n_ff(&hparams); auto & layer = model->layers[i];
// // 1d
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
// auto & layer = model->layers[i]; stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
// // 1d
// stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); // from 3d matrix layer x dim x dim to 2d matrix dim x dim
// stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
// stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
// // from 3d matrix layer x dim x dim to 2d matrix dim x dim stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
// stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]); stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
// stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
// stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]); stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
// stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]); stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
// stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
// stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]); }
// stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); // write tensors
// stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); write_tensor(&file, model->tok_embeddings);
// } write_tensor(&file, model->norm);
// // write tensors write_tensor(&file, model->output); // ?
// write_tensor(&file, model->tok_embeddings); for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
// write_tensor(&file, model->norm); auto & layer = model->layers[i];
// write_tensor(&file, model->output); // ?
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { write_tensor(&file, layer.attention_norm);
// auto & layer = model->layers[i]; write_tensor(&file, layer.wq);
// write_tensor(&file, layer.wk);
// write_tensor(&file, layer.attention_norm); write_tensor(&file, layer.wv);
// write_tensor(&file, layer.wq); write_tensor(&file, layer.wo);
// write_tensor(&file, layer.wk); write_tensor(&file, layer.ffn_norm);
// write_tensor(&file, layer.wv); write_tensor(&file, layer.w1);
// write_tensor(&file, layer.wo); write_tensor(&file, layer.w2);
// write_tensor(&file, layer.ffn_norm); write_tensor(&file, layer.w3);
// write_tensor(&file, layer.w1); }
// write_tensor(&file, layer.w2);
// write_tensor(&file, layer.w3);
// }
} }
struct train_params get_default_train_params() { struct train_params get_default_train_params() {
struct train_params params; struct train_params params;
params.fn_vocab_model = "models/ggml-vocab.bin"; params.fn_vocab_model = "tokenizer.bin";
params.fn_llama2c_output_model = "ak_llama_model.bin"; params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt"; params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin"; params.fn_checkpoint_in = "checkpoint.bin";
@ -743,7 +743,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "options:\n"); fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model); fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n"); fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
fprintf(stderr, "\n"); fprintf(stderr, "\n");