llama : vocabl private charsmap
ggml-ci
This commit is contained in:
parent
c949316ca4
commit
0f14663a4a
2 changed files with 13 additions and 13 deletions
|
@ -707,28 +707,28 @@ private:
|
|||
//
|
||||
|
||||
struct llm_tokenizer_ugm : llm_tokenizer {
|
||||
llm_tokenizer_ugm(const llama_vocab & vocab) {
|
||||
if (vocab.precompiled_charsmap.size() > 0) {
|
||||
llm_tokenizer_ugm(const llama_vocab & vocab, const std::vector<char> & precompiled_charsmap) {
|
||||
if (precompiled_charsmap.size() > 0) {
|
||||
size_t charsmap_offset = 0;
|
||||
|
||||
// First four bytes of precompiled_charsmap contains length of binary
|
||||
// blob containing XOR-compressed compact double array (XCDA) entries
|
||||
uint32_t xcda_blob_size = *(const uint32_t *) &vocab.precompiled_charsmap[0];
|
||||
uint32_t xcda_blob_size = *(const uint32_t *) &precompiled_charsmap[0];
|
||||
charsmap_offset += sizeof(xcda_blob_size);
|
||||
if (xcda_blob_size + charsmap_offset >= vocab.precompiled_charsmap.size()) {
|
||||
if (xcda_blob_size + charsmap_offset >= precompiled_charsmap.size()) {
|
||||
throw std::runtime_error("Index out of array bounds in precompiled charsmap!");
|
||||
}
|
||||
|
||||
// Next xcda_blob_size bytes contain entries of XOR-compressed compact
|
||||
// double array (XCDA). Each entry is bit-packed into a 32-bit integer.
|
||||
xcda_array = (const uint32_t *) &vocab.precompiled_charsmap[charsmap_offset];
|
||||
xcda_array = (const uint32_t *) &precompiled_charsmap[charsmap_offset];
|
||||
xcda_array_size = xcda_blob_size / sizeof(uint32_t);
|
||||
charsmap_offset += xcda_blob_size;
|
||||
|
||||
// Remaining bytes of precompiled charsmap contain null-terminated
|
||||
// replacement strings for prefixes matched by the XCDA.
|
||||
prefix_replacements = &vocab.precompiled_charsmap[charsmap_offset];
|
||||
prefix_replacements_size = vocab.precompiled_charsmap.size() - charsmap_offset;
|
||||
prefix_replacements = &precompiled_charsmap[charsmap_offset];
|
||||
prefix_replacements_size = precompiled_charsmap.size() - charsmap_offset;
|
||||
}
|
||||
|
||||
for (unsigned int id = 0; id < vocab.id_to_token.size(); ++id) {
|
||||
|
@ -1169,6 +1169,8 @@ private:
|
|||
struct llama_vocab::impl {
|
||||
std::unique_ptr<llm_tokenizer> tokenizer;
|
||||
|
||||
std::vector<char> precompiled_charsmap;
|
||||
|
||||
impl(const llama_vocab & vocab) : vocab(vocab) {
|
||||
}
|
||||
|
||||
|
@ -1195,7 +1197,7 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
|
|||
tokenizer = std::make_unique<llm_tokenizer_wpm>(vocab);
|
||||
break;
|
||||
case LLAMA_VOCAB_TYPE_UGM:
|
||||
tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab);
|
||||
tokenizer = std::make_unique<llm_tokenizer_ugm>(vocab, precompiled_charsmap);
|
||||
break;
|
||||
case LLAMA_VOCAB_TYPE_RWKV:
|
||||
tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
|
||||
|
@ -1334,14 +1336,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
|
|||
if (precompiled_charsmap_keyidx != -1) {
|
||||
size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
|
||||
const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
|
||||
precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
||||
pimpl->precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
|
||||
#ifdef IS_BIG_ENDIAN
|
||||
// correct endiannes of data in precompiled_charsmap binary blob
|
||||
uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
|
||||
uint32_t * xcda_blob_size = (uint32_t *) &pimpl->precompiled_charsmap[0];
|
||||
*xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
|
||||
assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
|
||||
size_t xcda_array_size = *xcda_blob_size / sizeof(uint32_t);
|
||||
uint32_t * xcda_array = (uint32_t *) &precompiled_charsmap[sizeof(uint32_t)];
|
||||
uint32_t * xcda_array = (uint32_t *) &pimpl->precompiled_charsmap[sizeof(uint32_t)];
|
||||
for (size_t i = 0; i < xcda_array_size; ++i) {
|
||||
xcda_array[i] = __builtin_bswap32(xcda_array[i]);
|
||||
}
|
||||
|
|
|
@ -73,8 +73,6 @@ struct llama_vocab {
|
|||
bool tokenizer_escape_whitespaces = true;
|
||||
bool tokenizer_treat_whitespace_as_suffix = false;
|
||||
|
||||
std::vector<char> precompiled_charsmap;
|
||||
|
||||
llama_vocab();
|
||||
~llama_vocab();
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue