convert : add support for XLMRoberta embedding models (#8658)

* add conversion for bge-m3; small fix in unigram tokenizer
* clean up and simplify XLMRoberta conversion
2024-08-06 02:20:54 -05:00 · 2024-08-06 02:20:54 -05:00 · cdd1889de6
commit cdd1889de6
parent c21a896405
2 changed files with 110 additions and 1 deletions
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -816,6 +816,9 @@ struct llm_tokenizer_ugm {
     * the best tokenization.
    */
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
+
        // normalize the input first
        std::string normalized;
        normalize(text, &normalized);
@@ -895,7 +898,7 @@ struct llm_tokenizer_ugm {
        }

        // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
+        std::reverse(output.begin() + output_size, output.end());
    }

 private: