llama-vocab, llama : handle <|eom_id|> Llama-3.1 token

2024-07-30 16:57:47 +02:00 · 2024-07-30 16:57:47 +02:00 · cc50e78fbe
commit cc50e78fbe
parent 7c27a19b2e
3 changed files with 25 additions and 1 deletions
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@ -1444,7 +1444,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 // Returns true when `token` is not -1 and equals one of the vocab's EOS, EOT or EOM token ids.
 // The -1 guard matters because the special_eot_id / special_eom_id fields default to -1,
 // so an id that was never set must not be reported as a match.
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
    return token != -1 && (
        token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
        token == llama_token_eom_impl(vocab)
    );
 }
@ -1500,6 +1501,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
    return vocab.special_eot_id;
 }
 // Returns vocab.special_eom_id, the id of the EOM special token.
 // The field is initialized to -1, so -1 is returned when no EOM token has been set.
 llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
    return vocab.special_eom_id;
 }
 int32_t llama_tokenize_impl(
    const struct llama_vocab & vocab,
                  const char * text,
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@ -45,6 +45,7 @@ struct llama_vocab {
    id special_suffix_id = -1;
    id special_middle_id = -1;
    id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
    id special_eom_id    = -1;
    // tokenizer flags
    bool tokenizer_add_space_prefix = false;
@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
 llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
 int32_t llama_tokenize_impl(
        const struct llama_vocab & vocab,
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -362,6 +362,7 @@ enum llm_kv {
    LLM_KV_TOKENIZER_SUFFIX_ID,
    LLM_KV_TOKENIZER_MIDDLE_ID,
    LLM_KV_TOKENIZER_EOT_ID,
    LLM_KV_TOKENIZER_EOM_ID,
    LLM_KV_ADAPTER_TYPE,
    LLM_KV_ADAPTER_LORA_ALPHA,
@ -459,6 +460,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_TOKENIZER_SUFFIX_ID,            "tokenizer.ggml.suffix_token_id"          },
    { LLM_KV_TOKENIZER_MIDDLE_ID,            "tokenizer.ggml.middle_token_id"          },
    { LLM_KV_TOKENIZER_EOT_ID,               "tokenizer.ggml.eot_token_id"             },
    { LLM_KV_TOKENIZER_EOM_ID,               "tokenizer.ggml.eom_token_id"             },
    { LLM_KV_ADAPTER_TYPE,                  "adapter.type"       },
    { LLM_KV_ADAPTER_LORA_ALPHA,            "adapter.lora.alpha" },
@ -5585,6 +5587,7 @@ static void llm_load_vocab(
            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
        };
        for (const auto & it : special_token_types) {
@ -5637,6 +5640,20 @@ static void llm_load_vocab(
                }
            }
        }
        // find EOM token: "<|eom_id|>"
        //
        // TODO: convert scripts should provide this token through the KV metadata LLM_KV_TOKENIZER_EOM_ID
        //       for now, we apply this workaround to find the EOM token based on its text
        if (vocab.special_eom_id == -1) {
            for (const auto & t : vocab.token_to_id) {
                if (t.first == "<|eom_id|>") {
                    vocab.special_eom_id = t.second;
                    break;
                }
            }
        }
    }
    // build special tokens cache