From 56895e28f6630457f6a02d82feee62d05f50c134 Mon Sep 17 00:00:00 2001
From: xaedes <xaedes@gmail.com>
Date: Mon, 29 May 2023 02:25:18 +0200
Subject: [PATCH] get vocabulary for exporting training checkpoint to llama
 compatible model file

---
 examples/baby-llama/baby-llama-text.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp
index 34a6d1051..267f44321 100644
--- a/examples/baby-llama/baby-llama-text.cpp
+++ b/examples/baby-llama/baby-llama-text.cpp
@@ -1943,6 +1943,25 @@ int main(int argc, char ** argv) {
 
     struct llama_context * lctx = llama_init_from_file(fn_model, llama_params);
 
+    struct llama_vocab vocab;
+    {
+        std::vector<const char *> strings;
+        std::vector<float> scores;
+        int n_vocab = llama_n_vocab(lctx);
+        strings.resize(n_vocab, NULL);
+        scores.resize(n_vocab, 0);
+        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
+        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        vocab.id_to_token.resize(n_vocab);
+        for (int i=0; i<n_vocab; ++i) {
+            std::string tok   = std::string(strings[i]);
+            float       score = scores[i];
+            vocab.id_to_token[i].tok   = tok;
+            vocab.id_to_token[i].score = score;
+            vocab.token_to_id.emplace(tok, i);
+        }
+    }
+
     std::vector<llama_token> train_tokens;
     if (tokenize_file(lctx, fn_train, train_tokens) < 0) {