refactor: respect special token from metadata

When vocab.type is SPM, we determine the linefeed_id by searching for the newline character, and use special_pad_id instead if it is not found. The special_*_id values are usually recorded in metadata, so to ensure that special_pad_id can be used correctly, we need to obtain it from metadata first and then perform the above lookup logic. Signed-off-by: thxCode <thxcode0824@gmail.com>
2024-08-06 17:10:21 +08:00 · 2024-08-06 17:10:21 +08:00 · 0b90345749
commit 0b90345749
parent bb55b19c04
1 changed files with 80 additions and 80 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -5517,60 +5517,6 @@ static void llm_load_vocab(
    }
    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
        // For Fill-In-the-Middle (FIM)/infill models which where converted
        // prior to support of FIM special tokens in GGUF, the following
        // will allow those models to continue to work. The general names
        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
        // new versions of these models have been published.
        std::string gen_name;
        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
            [](unsigned char c){ return std::tolower(c); });
        if (gen_name.find("code") != std::string::npos) {
            if (model.arch == LLM_ARCH_LLAMA
              && 32010 < vocab.id_to_token.size()
              && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
              && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
              && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
              && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
                vocab.special_prefix_id = 32007;
                vocab.special_suffix_id = 32008;
                vocab.special_middle_id = 32009;
                vocab.special_eot_id    = 32010;
            } else if (model.arch == LLM_ARCH_GEMMA
              && 107 < vocab.id_to_token.size()
              && vocab.id_to_token[67].text == "<|fim_prefix|>"
              && vocab.id_to_token[69].text == "<|fim_suffix|>"
              && vocab.id_to_token[68].text == "<|fim_middle|>"
              && vocab.id_to_token[107].text == "<end_of_turn>") {
                vocab.special_prefix_id = 67;
                vocab.special_suffix_id = 69;
                vocab.special_middle_id = 68;
                // TODO: this is not EOT, it is "file separator" token, needs fix
                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
                //vocab.special_eot_id    = 70;
                vocab.special_eot_id    = 107;
            }
        }
        try {
            vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
        } catch (const std::exception & e) {
            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
            vocab.linefeed_id = vocab.special_pad_id;
        }
    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
        vocab.linefeed_id = vocab.special_pad_id;
    } else {
        const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
        vocab.linefeed_id = ids[0];
    }
    // special tokens
    {
      const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
@ -5651,6 +5597,60 @@ static void llm_load_vocab(
      }
    }
    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
        // For Fill-In-the-Middle (FIM)/infill models which where converted
        // prior to support of FIM special tokens in GGUF, the following
        // will allow those models to continue to work. The general names
        // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and
        // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once
        // new versions of these models have been published.
        std::string gen_name;
        ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false);
        std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(),
            [](unsigned char c){ return std::tolower(c); });
        if (gen_name.find("code") != std::string::npos) {
            if (model.arch == LLM_ARCH_LLAMA
              && 32010 < vocab.id_to_token.size()
              && vocab.id_to_token[32007].text.find("<PRE>") != std::string::npos
              && vocab.id_to_token[32008].text.find("<SUF>") != std::string::npos
              && vocab.id_to_token[32009].text.find("<MID>") != std::string::npos
              && vocab.id_to_token[32010].text.find("<EOT>") != std::string::npos) {
                vocab.special_prefix_id = 32007;
                vocab.special_suffix_id = 32008;
                vocab.special_middle_id = 32009;
                vocab.special_eot_id    = 32010;
            } else if (model.arch == LLM_ARCH_GEMMA
              && 107 < vocab.id_to_token.size()
              && vocab.id_to_token[67].text == "<|fim_prefix|>"
              && vocab.id_to_token[69].text == "<|fim_suffix|>"
              && vocab.id_to_token[68].text == "<|fim_middle|>"
              && vocab.id_to_token[107].text == "<end_of_turn>") {
                vocab.special_prefix_id = 67;
                vocab.special_suffix_id = 69;
                vocab.special_middle_id = 68;
                // TODO: this is not EOT, it is "file separator" token, needs fix
                //       https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
                //vocab.special_eot_id    = 70;
                vocab.special_eot_id    = 107;
            }
        }
        try {
            vocab.linefeed_id = llama_byte_to_token_impl(vocab, '\n');
        } catch (const std::exception & e) {
            LLAMA_LOG_WARN("%s: SPM vocabulary, but newline token not found: %s! Using special_pad_id instead.", __func__, e.what());
            vocab.linefeed_id = vocab.special_pad_id;
        }
    } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
        vocab.linefeed_id = vocab.special_pad_id;
    } else {
        const std::vector<int> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
        GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
        vocab.linefeed_id = ids[0];
    }
    // build special tokens cache
    {
        for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {