refactor: respect special token from metadata

when vocab.type is SPM, we will confirm the linefeed_id by searching the char, and use special_pad_id instead if not found. the special_*_id are usually record in metadata, to ensure the special_pad_id can be used correctly, we need to obtain it from metadata first and then perform the above confirmation logic. Signed-off-by: thxCode <thxcode0824@gmail.com>
2024-08-06 17:10:21 +08:00 · 2024-08-06 17:10:21 +08:00 · 0b90345749
commit 0b90345749
parent bb55b19c04
1 changed files with 80 additions and 80 deletions
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -5517,6 +5517,86 @@ static void llm_load_vocab(
    }
    GGML_ASSERT(vocab.id_to_token.size() == vocab.token_to_id.size());
    // special tokens
    {
      const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
          { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
          { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
          { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
          { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
          { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
          { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
          { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
          { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
          { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
          { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
          { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
          { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
      };
      for (const auto & it : special_token_types) {
        const std::string & key = kv(std::get<0>(it));
        int32_t & id = std::get<1>(it);
        uint32_t new_id;
        if (!ml.get_key(std::get<0>(it), new_id, false)) {
          continue;
        }
        if (new_id >= vocab.id_to_token.size()) {
          LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
                         __func__, key.c_str(), new_id, id);
        } else {
          id = new_id;
        }
      }
      // Handle add_bos_token and add_eos_token
      {
        bool temp = true;
        if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
          vocab.tokenizer_add_bos = temp;
        }
        if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
          vocab.tokenizer_add_eos = temp;
        }
      }
      // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
      //
      // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
      //       for now, we apply this workaround to find the EOT token based on its text
      if (vocab.special_eot_id == -1) {
        for (const auto & t : vocab.token_to_id) {
          if (
              // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
              //       need to fix convert script
              //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
              (t.first == "<|eot_id|>" ||
               t.first == "<|im_end|>" ||
               t.first == "<|end|>" ||
               t.first == "<end_of_turn>" ||
               t.first == "<|endoftext|>"
               )
          ) {
            vocab.special_eot_id = t.second;
            break;
          }
        }
      }
      // find EOM token: "<|eom_id|>"
      //
      // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
      //       for now, we apply this workaround to find the EOM token based on its text
      if (vocab.special_eom_id == -1) {
        const auto & t = vocab.token_to_id.find("<|eom_id|>");
        if (t != vocab.token_to_id.end()) {
          vocab.special_eom_id = t->second;
        }
      }
    }
    // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
    if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
        // For Fill-In-the-Middle (FIM)/infill models which where converted
@ -5571,86 +5651,6 @@ static void llm_load_vocab(
        vocab.linefeed_id = ids[0];
    }
    // special tokens
    {
        const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
            { LLM_KV_TOKENIZER_BOS_ID,    vocab.special_bos_id    },
            { LLM_KV_TOKENIZER_EOS_ID,    vocab.special_eos_id    },
            { LLM_KV_TOKENIZER_UNK_ID,    vocab.special_unk_id    },
            { LLM_KV_TOKENIZER_SEP_ID,    vocab.special_sep_id    },
            { LLM_KV_TOKENIZER_PAD_ID,    vocab.special_pad_id    },
            { LLM_KV_TOKENIZER_CLS_ID,    vocab.special_cls_id    },
            { LLM_KV_TOKENIZER_MASK_ID,   vocab.special_mask_id   },
            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id },
            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
            { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id    },
            { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id    },
        };
        for (const auto & it : special_token_types) {
            const std::string & key = kv(std::get<0>(it));
            int32_t & id = std::get<1>(it);
            uint32_t new_id;
            if (!ml.get_key(std::get<0>(it), new_id, false)) {
                continue;
            }
            if (new_id >= vocab.id_to_token.size()) {
                LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
                    __func__, key.c_str(), new_id, id);
            } else {
                id = new_id;
            }
        }
        // Handle add_bos_token and add_eos_token
        {
            bool temp = true;
            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
                vocab.tokenizer_add_bos = temp;
            }
            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
                vocab.tokenizer_add_eos = temp;
            }
        }
        // find EOT token: "<|eot_id|>", "<|im_end|>", "<end_of_turn>", etc.
        //
        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
        //       for now, we apply this workaround to find the EOT token based on its text
        if (vocab.special_eot_id == -1) {
            for (const auto & t : vocab.token_to_id) {
                if (
                        // TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
                        //       need to fix convert script
                        //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
                        (t.first == "<|eot_id|>" ||
                         t.first == "<|im_end|>" ||
                         t.first == "<|end|>" ||
                         t.first == "<end_of_turn>" ||
                         t.first == "<|endoftext|>"
                        )
                   ) {
                    vocab.special_eot_id = t.second;
                    break;
                }
            }
        }
        // find EOM token: "<|eom_id|>"
        //
        // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOM_ID
        //       for now, we apply this workaround to find the EOM token based on its text
        if (vocab.special_eom_id == -1) {
            const auto & t = vocab.token_to_id.find("<|eom_id|>");
            if (t != vocab.token_to_id.end()) {
                vocab.special_eom_id = t->second;
            }
        }
    }
    // build special tokens cache
    {
        for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {