llama : fix Roberta embeddings (#10856)
* fix: Use gpt2 tokenizer for roberta and add eos/bos tokens Branch: RobertaTokenizer Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fixes to position embeddings Signed-off-by: Sukriti-Sharma4 <sukriti.sharma4@ibm.com> * map roberta-bpe to gpt-2 Signed-off-by: Sukriti-Sharma4 <sukriti.sharma4@ibm.com> * fix linting Signed-off-by: Sukriti-Sharma4 <sukriti.sharma4@ibm.com> --------- Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> Signed-off-by: Sukriti-Sharma4 <sukriti.sharma4@ibm.com> Co-authored-by: Gabe Goodhart <ghart@us.ibm.com>
This commit is contained in:
parent
7585edbdeb
commit
2fffc52b50
2 changed files with 48 additions and 2 deletions
|
@ -6592,7 +6592,8 @@ static void llm_load_vocab(
|
|||
tokenizer_pre == "jina-v1-en" ||
|
||||
tokenizer_pre == "jina-v2-es" ||
|
||||
tokenizer_pre == "jina-v2-de" ||
|
||||
tokenizer_pre == "jina-v2-code") {
|
||||
tokenizer_pre == "jina-v2-code" ||
|
||||
tokenizer_pre == "roberta-bpe") {
|
||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
|
||||
} else if (
|
||||
tokenizer_pre == "refact") {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue