From 0f14663a4af42a37f924dd25e3f56c0205c2386a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 8 Jan 2025 20:55:30 +0200
Subject: [PATCH] llama : vocabl private charsmap

ggml-ci
---
 src/llama-vocab.cpp | 24 +++++++++++++-----------
 src/llama-vocab.h   |  2 --
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 25ff27dbe..1c1328b22 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -707,28 +707,28 @@ private:
 //
 struct llm_tokenizer_ugm : llm_tokenizer {
-    llm_tokenizer_ugm(const llama_vocab & vocab) {
-        if (vocab.precompiled_charsmap.size() > 0) {
+    llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
+        if (precompiled_charsmap.size() > 0) {
             size_t charsmap_offset = 0;
 
             // First four bytes of precompiled_charsmap contains length of binary
             // blob containing XOR-compressed compact double array (XCDA) entries
-            uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
+            uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
             charsmap_offset += sizeof(xcda_blob_size);
-            if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
+            if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
                 throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
             }
 
             // Next xcda_blob_size bytes contain entries of XOR-compressed compact
             // double array (XCDA). Each entry is bit-packed into a 32-bit integer.
-            xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
+            xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
             xcda_array_size = xcda_blob_size / sizeof(uint32_t);
             charsmap_offset += xcda_blob_size;
 
             // Remaining bytes of precompiled charsmap contain null-terminated
             // replacement strings for prefixes matched by the XCDA.
-            prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
-            prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
+            prefix_replacements = &precompiled_charsmap[charsmap_offset];
+            prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
         }
 
         for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
@@ -1169,6 +1169,8 @@ private:
 struct llama_vocab::impl {
     std::unique_ptr<llm_tokenizer> tokenizer;
 
+    std::vector<char> precompiled_charsmap;
+
     impl(const llama_vocab & vocab) : vocab(vocab) {
     }
 
@@ -1195,7 +1197,7 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
             tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
             break;
         case LLAMA_VOCAB_TYPE_UGM:
-            tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab);
+            tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
             break;
         case LLAMA_VOCAB_TYPE_RWKV:
             tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
@@ -1334,14 +1336,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
         if (precompiled_charsmap_keyidx != -1) {
             size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
             const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
-            precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
+            pimpl->precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #ifdef IS_BIG_ENDIAN
             // correct endiannes of data in precompiled_charsmap binary blob
-            uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
+            uint32_t * xcda_blob_size = (uint32_t *) &pimpl->precompiled_charsmap[0];
             *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
             assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
             size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
-            uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
+            uint32_t * xcda_array = (uint32_t *) &pimpl->precompiled_charsmap[sizeof(uint32_t)];
             for (size_t i = 0; i < xcda_array_size; ++i) {
                 xcda_array[i] = __builtin_bswap32(xcda_array[i]);
             }
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index f1c735dba..6eb5e97e0 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -73,8 +73,6 @@ struct llama_vocab {
     bool tokenizer_escape_whitespaces = true;
    bool tokenizer_treat_whitespace_as_suffix = false;
 
-    std::vector<char> precompiled_charsmap;
-
     llama_vocab();
     ~llama_vocab();
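For readers following the blob layout that the UGM tokenizer constructor parses, below is a minimal standalone C++ sketch of the same three-part format described in the patch's comments: a 4-byte XCDA blob size, then the bit-packed 32-bit XCDA entries, then null-terminated prefix replacement strings. The charsmap_view struct and parse_charsmap function are hypothetical names used only for illustration; they are not part of the patch or of llama.cpp.

#include <cstdint>
#include <cstring>
#include <stdexcept>
#include <vector>

// Hypothetical view over a precompiled charsmap blob, mirroring the layout
// that llm_tokenizer_ugm parses in the patch above.
struct charsmap_view {
    const uint32_t * xcda_array               = nullptr; // bit-packed XCDA entries
    size_t           xcda_array_size          = 0;
    const char *     prefix_replacements      = nullptr; // null-terminated strings
    size_t           prefix_replacements_size = 0;
};

static charsmap_view parse_charsmap(const std::vector<char> & blob) {
    charsmap_view view;
    if (blob.empty()) {
        return view;
    }

    // first 4 bytes: size of the XOR-compressed compact double array (XCDA) blob
    uint32_t xcda_blob_size = 0;
    std::memcpy(&xcda_blob_size, blob.data(), sizeof(xcda_blob_size));

    size_t offset = sizeof(xcda_blob_size);
    if (xcda_blob_size + offset >= blob.size()) {
        throw std::runtime_error("index out of bounds in precompiled charsmap");
    }

    // next xcda_blob_size bytes: 32-bit bit-packed XCDA entries
    view.xcda_array      = reinterpret_cast<const uint32_t *>(blob.data() + offset);
    view.xcda_array_size = xcda_blob_size / sizeof(uint32_t);
    offset += xcda_blob_size;

    // remaining bytes: null-terminated replacement strings for matched prefixes
    view.prefix_replacements      = blob.data() + offset;
    view.prefix_replacements_size = blob.size() - offset;

    return view;
}

Unlike the in-place pointer cast used in the patch, the sketch copies the leading length field with std::memcpy to avoid an unaligned read; on big-endian hosts the 32-bit values would additionally need byte-swapping, which the load path above performs under IS_BIG_ENDIAN.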