llama : vocab : private charsmap

ggml-ci
This commit is contained in:
Georgi Gerganov 2025-01-08 20:55:30 +02:00
parent c949316ca4
commit 0f14663a4a
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
2 changed files with 13 additions and 13 deletions

View file

@ -707,28 +707,28 @@ private:
//
struct llm_tokenizer_ugm : llm_tokenizer {
llm_tokenizer_ugm(const llama_vocab & vocab) {
if (vocab.precompiled_charsmap.size() > 0) {
llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
if (precompiled_charsmap.size() > 0) {
size_t charsmap_offset = 0;
// First four bytes of precompiled_charsmap contains length of binary
// blob containing XOR-compressed compact double array (XCDA) entries
uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
charsmap_offset += sizeof(xcda_blob_size);
if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
}
// Next xcda_blob_size bytes contain entries of XOR-compressed compact
// double array (XCDA). Each entry is bit-packed into a 32-bit integer.
xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
xcda_array_size = xcda_blob_size / sizeof(uint32_t);
charsmap_offset += xcda_blob_size;
// Remaining bytes of precompiled charsmap contain null-terminated
// replacement strings for prefixes matched by the XCDA.
prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
prefix_replacements = &precompiled_charsmap[charsmap_offset];
prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
}
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
@ -1169,6 +1169,8 @@ private:
struct llama_vocab::impl {
std::unique_ptr<llm_tokenizer> tokenizer;
std::vector<char> precompiled_charsmap;
impl(const llama_vocab & vocab) : vocab(vocab) {
}
@ -1195,7 +1197,7 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
break;
case LLAMA_VOCAB_TYPE_UGM:
tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab);
tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
break;
case LLAMA_VOCAB_TYPE_RWKV:
tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
@ -1334,14 +1336,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
if (precompiled_charsmap_keyidx != -1) {
size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
pimpl->precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
#ifdef IS_BIG_ENDIAN
// correct endiannes of data in precompiled_charsmap binary blob
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
uint32_t * xcda_blob_size = (uint32_t *) &pimpl->precompiled_charsmap[0];
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
uint32_t * xcda_array = (uint32_t *) &pimpl->precompiled_charsmap[sizeof(uint32_t)];
for (size_t i = 0; i < xcda_array_size; ++i) {
xcda_array[i] = __builtin_bswap32(xcda_array[i]);
}

View file

@ -73,8 +73,6 @@ struct llama_vocab {
bool tokenizer_escape_whitespaces = true;
bool tokenizer_treat_whitespace_as_suffix = false;
std::vector<char> precompiled_charsmap;
llama_vocab();
~llama_vocab();