From c88362d1941497e1dc385c06136b17da17bb4d9e Mon Sep 17 00:00:00 2001
From: ochafik
Date: Sat, 26 Aug 2023 19:47:11 +0100
Subject: [PATCH] llama2.c: support copying vocab from a llama gguf model file

---
 .../convert-llama2c-to-ggml.cpp               | 106 ++++++++++++++----
 1 file changed, 82 insertions(+), 24 deletions(-)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index d906c2f07..db9c7817c 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -531,31 +531,87 @@ bool is_ggml_file(const char *filename) {
     return magic == GGUF_MAGIC;
 }
 
-llama_vocab::ttype get_token_type(llama_vocab::id id, const llama_vocab::token &text) {
-    if (id == UNKNOWN_TOKEN_ID) return LLAMA_TOKEN_TYPE_UNKNOWN;
-    if (id == BOS_TOKEN_ID || id == EOS_TOKEN_ID) return LLAMA_TOKEN_TYPE_CONTROL;
-    unsigned char byte_val;
-    if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
-        return LLAMA_TOKEN_TYPE_BYTE;
-    }
-    return LLAMA_TOKEN_TYPE_NORMAL;
-}
-
 void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
-    // assume llama2.c vocabulary
-    printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
-    llama_file file(filename, "rb");
-    const int n_vocab = config->vocab_size;
-    /* uint32_t max_token_length = */ file.read_u32(); // unused
-    vocab->id_to_token.resize(n_vocab);
-    for (int i=0; i<n_vocab; ++i) {
-        float_t score = file.read_f32();
-        uint32_t len = file.read_u32();
-        std::string text = file.read_string(len);
-        vocab->id_to_token[i].text = text;
-        vocab->id_to_token[i].score = score;
-        vocab->id_to_token[i].type = get_token_type(i, text);
-        vocab->token_to_id.emplace(text, i);
+    if (is_ggml_file(filename)) {
+        struct ggml_context * ctx_data = NULL;
+
+        struct gguf_init_params params = {
+            /*.no_alloc = */ false,
+            /*.ctx      = */ &ctx_data,
+        };
+
+        struct gguf_context * ctx = gguf_init_from_file(filename, params);
+        GGML_ASSERT(ctx != NULL);
+
+        const int model_idx = gguf_find_key(ctx, "tokenizer.ggml.model");
+        GGML_ASSERT(model_idx >= 0);
+        std::string tokenizer_name = gguf_get_val_str(ctx, model_idx);
+        GGML_ASSERT(tokenizer_name == "llama");
+
+        const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
+        GGML_ASSERT(token_idx >= 0);
+
+        const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
+        GGML_ASSERT(score_idx >= 0);
+        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+
+        const int toktype_idx = gguf_find_key(ctx, "tokenizer.ggml.token_type");
+        GGML_ASSERT(toktype_idx >= 0);
+        const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
+
+        const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
+
+        vocab->id_to_token.resize(n_vocab);
+
+        for (uint32_t i = 0; i < n_vocab; i++) {
+            std::string word = gguf_get_arr_str(ctx, token_idx, i);
+
+            vocab->token_to_id[word] = i;
+
+            auto & token_data = vocab->id_to_token[i];
+            token_data.text  = std::move(word);
+            token_data.score = scores[i];
+            token_data.type  = (llama_token_type) toktypes[i];
+        }
+        ggml_free(ctx_data);
+        gguf_free(ctx);
+    } else {
+        // assume llama2.c vocabulary
+        printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
+        llama_file file(filename, "rb");
+        const int n_vocab = config->vocab_size;
+        /* uint32_t max_token_length = */ file.read_u32(); // unused
+        vocab->id_to_token.resize(n_vocab);
+        for (llama_vocab::id id=0; id<n_vocab; ++id) {
+            float_t score = file.read_f32();
+            uint32_t len = file.read_u32();
+            std::string text = file.read_string(len);
+
+            unsigned char byte_val;
+            llama_vocab::ttype type;
+            if (id == UNKNOWN_TOKEN_ID) {
+                type = LLAMA_TOKEN_TYPE_UNKNOWN;
+            } else if (id == BOS_TOKEN_ID || id == EOS_TOKEN_ID) {
+                type = LLAMA_TOKEN_TYPE_CONTROL;
+            } else if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
+                // Text of byte tokens is already in the expected format.
+                type = LLAMA_TOKEN_TYPE_BYTE;
+            } else {
+                type = LLAMA_TOKEN_TYPE_NORMAL;
+            }
+
+            vocab->id_to_token[id].text = text;
+            vocab->id_to_token[id].score = score;
+            vocab->id_to_token[id].type = type;
+            vocab->token_to_id.emplace(text, id);
+        }
     }
 }
 
@@ -651,6 +707,8 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     gguf_set_val_u32(ctx, "tokenizer.ggml.unknown_token_id", UNKNOWN_TOKEN_ID);
     gguf_set_val_u32(ctx, "tokenizer.ggml.bos_token_id", BOS_TOKEN_ID);
     gguf_set_val_u32(ctx, "tokenizer.ggml.eos_token_id", EOS_TOKEN_ID);
+    gguf_set_val_u32(ctx, "tokenizer.ggml.sep_token_id", -1);
+    gguf_set_val_u32(ctx, "tokenizer.ggml.pad_token_id", -1);
 
     gguf_set_val_u32(ctx, "llama.context_length", model->hparams.n_ctx);
     gguf_set_val_u32(ctx, "llama.embedding_length", model->hparams.n_embd);
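
Usage sketch: with this change, the file passed as the vocab source may be either a
gguf llama model (new is_ggml_file() branch above) or a llama2.c tokenizer file (else
branch). A plausible invocation, assuming the example's existing --copy-vocab-from-model,
--llama2c-model and --llama2c-output-model flags; model paths are illustrative:

    ./convert-llama2c-to-ggml \
        --copy-vocab-from-model models/7B/ggml-model-f16.gguf \
        --llama2c-model stories42M.bin \
        --llama2c-output-model stories42M.gguf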