convert : add support for XLMRoberta embedding models (#8658)

* add conversion for bge-m3; small fix in unigram tokenizer

* clean up and simplify XLMRoberta conversion
This commit is contained in:
Douglas Hanley 2024-08-06 02:20:54 -05:00 committed by GitHub
parent c21a896405
commit cdd1889de6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 110 additions and 1 deletion

View file

@@ -816,6 +816,9 @@ struct llm_tokenizer_ugm {
* the best tokenization.
*/
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ // get current size of output (for reversal later)
+ size_t output_size = output.size();
// normalize the input first
std::string normalized;
normalize(text, &normalized);
@@ -895,7 +898,7 @@ struct llm_tokenizer_ugm {
}
// reverse the output since we added tokens starting from the end of the input
- std::reverse(output.begin(), output.end());
+ std::reverse(output.begin() + output_size, output.end());
}
private: