llama : add handling of byte tokens in UGM tokenizer (same as in SPM)

llama : prevent crashes when precompiled_charsmap is not present
2024-06-24 17:39:41 +02:00 · 2024-06-24 17:39:41 +02:00 · f4c03c0966
commit f4c03c0966
parent c2c799cefa
1 changed file with 39 additions and 35 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -13335,7 +13335,8 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
    GGML_ASSERT(llama_is_byte_token(vocab, id));
    const auto & token_data = vocab.id_to_token.at(id);
    switch (llama_vocab_get_type(vocab)) {
-        case LLAMA_VOCAB_TYPE_SPM: {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
            auto buf = token_data.text.substr(3, 2);
            return strtol(buf.c_str(), NULL, 16);
        }
@@ -13355,7 +13356,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
    GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
    static const char * hex = "0123456789ABCDEF";
    switch (llama_vocab_get_type(vocab)) {
-        case LLAMA_VOCAB_TYPE_SPM: {
+        case LLAMA_VOCAB_TYPE_SPM:
+        case LLAMA_VOCAB_TYPE_UGM: {
            const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
            auto token = vocab.token_to_id.find(buf);
            if (token != vocab.token_to_id.end()) {
@@ -14242,6 +14244,7 @@ private:
        size_t longest_prefix_length = 0;
        size_t longest_prefix_offset = 0;

+        if (xcda_array_size > 0) {
            struct xcda_array_view xcda_view(xcda_array, xcda_array_size);

            // Find the longest normalized sequence matching the input prefix by walking
@@ -14274,6 +14277,7 @@ private:
                    longest_prefix_offset = xcda_view.get_value(node_index);
                }
            }
+        }

        if (longest_prefix_length > 0) {
            // we have a match, so return the replacement sequence
@@ -14299,11 +14303,11 @@ private:
    // escaped space symbol - U+2581 (Lower One Eighth Block)
    const std::string escaped_space = "\xE2\x96\x81";

-    char * prefix_replacements;
-    size_t prefix_replacements_size;
+    char * prefix_replacements = NULL;
+    size_t prefix_replacements_size = 0;

-    uint32_t * xcda_array;
-    size_t xcda_array_size;
+    uint32_t * xcda_array = NULL;
+    size_t xcda_array_size = 0;

    struct naive_trie user_defined_token_matcher;