BERT tokenizer fixes (#6498)

Key changes:
* BERT conversion: fix abuse of LlamaHfVocab, do not set BOS or EOS
* Nomic Embed conversion: pad vocab instead of slicing embedding tensor
* llama_tokenize: handle added special tokens like HF does
2024-04-09 13:44:08 -04:00 · 2024-04-09 13:44:08 -04:00 · 1b67731e18
commit 1b67731e18
parent c4a3a4ff47
20 changed files with 221 additions and 194 deletions
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -123,10 +123,10 @@ int main(int argc, char ** argv) {
        inputs.push_back(inp);
    }

-    // add eos if not present
+    // add SEP if not present
    for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_token_eos(model)) {
-            inp.push_back(llama_token_eos(model));
+        if (inp.empty() || inp.back() != llama_token_sep(model)) {
+            inp.push_back(llama_token_sep(model));
        }
    }