map roberta-bpe to gpt-2

Signed-off-by: Sukriti-Sharma4 <sukriti.sharma4@ibm.com>
This commit is contained in:
Sukriti-Sharma4 2024-12-18 18:37:00 -07:00
parent d5f69e8a43
commit 334ddfd97d
2 changed files with 3 additions and 6 deletions

View file

@@ -663,11 +663,7 @@ class Model:
             res = "minerva-7b"
         if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65":
             # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base
-            # NOTE: The Roberta tokenizer is the same as GPT-2, but it always
-            # adds the cls/sep tokens as bos/eos. This is handled as a
-            # post-processor in tokenizers, so the chkhsh is different, but
-            # it still maps to gpt-2 internally.
-            res = "gpt-2"
+            res = "roberta-bpe"
 
         if res is None:
             logger.warning("\n")

View file

@@ -6406,7 +6406,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "jina-v2-code") {
+                tokenizer_pre == "jina-v2-code" ||
+                tokenizer_pre == "roberta-bpe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else if (
                 tokenizer_pre == "refact") {