llama2c: reinstate ggmlv3 conversion output + update readme w/ gguf conv

2023-08-22 21:12:29 +01:00 · 2023-08-22 21:12:29 +01:00 · 59f67c69a7
commit 59f67c69a7
parent 0f7cb95352
2 changed files with 83 additions and 79 deletions
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -12,15 +12,19 @@ usage: ./convert-llama2c-to-ggml [options]
 options:
  -h, --help                       show this help message and exit
-  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'models/ggml-vocab.bin')
+  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'tokenizer.bin')
  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default 'ak_llama_model.bin')
 ```
-An example command is as follows:
+An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
-`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
-Now you can use the model with command like:
+For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
-`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
+`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
 Now you can use the model with a command like:
 `$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -17,6 +17,9 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 #define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
 #define LLAMA_FILE_VERSION_GGJT_V3   3
 //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
 typedef struct {
    int dim; // transformer dimension
@@ -614,83 +617,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
    }
 #pragma message("TODO: implement file saving using gguf")
-    (void) vocab;
+    // write_magic
-    (void) model;
+    file.write_u32(LLAMA_FILE_MAGIC_GGJT);   // magic
-    (void) w;
+    file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
-//    // write_magic
+    // write_hparams
-//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+    file.write_u32(model->hparams.n_vocab);
-//    file.write_u32(LLAMA_FILE_VERSION); // version
+    file.write_u32(model->hparams.n_embd);
-//    // write_hparams
+    file.write_u32(model->hparams.n_mult);
-//    file.write_u32(model->hparams.n_vocab);
+    file.write_u32(model->hparams.n_head);
-//    file.write_u32(model->hparams.n_embd);
+    file.write_u32(model->hparams.n_layer);
-//    file.write_u32(model->hparams.n_mult);
+    file.write_u32(model->hparams.n_rot);
-//    file.write_u32(model->hparams.n_head);
+    file.write_u32(LLAMA_FTYPE_ALL_F32);
-//    file.write_u32(model->hparams.n_layer);
+
-//    file.write_u32(model->hparams.n_rot);
+    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
-//    file.write_u32(LLAMA_FTYPE_ALL_F32);
+    uint32_t n_vocab = model->hparams.n_vocab;
-//
+    for (uint32_t i = 0; i < n_vocab; i++) {
-//    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+        const auto & token_data = vocab->id_to_token.at(i);
-//    uint32_t n_vocab = model->hparams.n_vocab;
+        file.write_u32((uint32_t) token_data.text.size());
-//    for (uint32_t i = 0; i < n_vocab; i++) {
+        file.write_raw(token_data.text.data(), token_data.text.size());
-//        const auto & token_data = vocab->id_to_token.at(i);
+        file.write_raw(&token_data.score, sizeof(token_data.score));
-//        file.write_u32((uint32_t) token_data.tok.size());
+    }
-//        file.write_raw(token_data.tok.data(), token_data.tok.size());
+
-//        file.write_raw(&token_data.score, sizeof(token_data.score));
+    // stuff AK weights into GG weights one by one.
-//    }
+    // w->token_embedding_table -> model->tok_embeddings
-//
+    // float*                   -> struct ggml_tensor
-//    // stuff AK weights into GG weights one by one.
+    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
-//    // w->token_embedding_table -> model->tok_embeddings
+    stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
-//    // float*                   -> struct ggml_tensor
+
-//    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
-//    stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+    //print_row(model->norm, 0);
-//
+
-//    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+    // for rms-att-weight
-//    //print_row(model->norm, 0);
+    int row_length = model->hparams.n_embd;
-//
+    const auto & hparams = model->hparams;
-//    // for rms-att-weight
+    //int n_ff = model->hparams.n_embd;
-//    int row_length = model->hparams.n_embd;
+    int n_ff = get_n_ff(&hparams);
-//    const auto & hparams = model->hparams;
+
-//    //int n_ff = model->hparams.n_embd;
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
-//    int n_ff = get_n_ff(&hparams);
+        auto & layer = model->layers[i];
-//
+        // 1d
-//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-//        auto & layer = model->layers[i];
+        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
-//        // 1d
+
-//        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-//        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
-//
+        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
-//        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
-//        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
-//        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+
-//        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
-//        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
-//
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
-//        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+    }
-//        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+    // write tensors
-//        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+    write_tensor(&file, model->tok_embeddings);
-//    }
+    write_tensor(&file, model->norm);
-//    // write tensors
+    write_tensor(&file, model->output); // ?
-//    write_tensor(&file, model->tok_embeddings);
+    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-//    write_tensor(&file, model->norm);
+        auto & layer = model->layers[i];
-//    write_tensor(&file, model->output); // ?
+
-//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+        write_tensor(&file, layer.attention_norm);
-//        auto & layer = model->layers[i];
+        write_tensor(&file, layer.wq);
-//
+        write_tensor(&file, layer.wk);
-//        write_tensor(&file, layer.attention_norm);
+        write_tensor(&file, layer.wv);
-//        write_tensor(&file, layer.wq);
+        write_tensor(&file, layer.wo);
-//        write_tensor(&file, layer.wk);
+        write_tensor(&file, layer.ffn_norm);
-//        write_tensor(&file, layer.wv);
+        write_tensor(&file, layer.w1);
-//        write_tensor(&file, layer.wo);
+        write_tensor(&file, layer.w2);
-//        write_tensor(&file, layer.ffn_norm);
+        write_tensor(&file, layer.w3);
-//        write_tensor(&file, layer.w1);
+    }
 //        write_tensor(&file, layer.w2);
 //        write_tensor(&file, layer.w3);
 //    }
 }
 struct train_params get_default_train_params() {
    struct train_params params;
-    params.fn_vocab_model    = "models/ggml-vocab.bin";
+    params.fn_vocab_model    = "tokenizer.bin";
    params.fn_llama2c_output_model = "ak_llama_model.bin";
    params.fn_train_data     = "shakespeare.txt";
    params.fn_checkpoint_in  = "checkpoint.bin";
@@ -743,7 +743,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
-    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
    fprintf(stderr, "\n");