convert : add support for XLMRoberta embedding models (#8658)
* add conversion for bge-m3; small fix in unigram tokenizer
* clean up and simplify XLMRoberta conversion
This commit is contained in:
parent
c21a896405
commit
cdd1889de6
2 changed files with 110 additions and 1 deletion
|
@@ -816,6 +816,9 @@ struct llm_tokenizer_ugm {
|
|||
* the best tokenization.
|
||||
*/
|
||||
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||
// get current size of output (for reversal later)
|
||||
size_t output_size = output.size();
|
||||
|
||||
// normalize the input first
|
||||
std::string normalized;
|
||||
normalize(text, &normalized);
|
||||
|
@@ -895,7 +898,7 @@ struct llm_tokenizer_ugm {
|
|||
}
|
||||
|
||||
// reverse the output since we added tokens starting from the end of the input
|
||||
std::reverse(output.begin(), output.end());
|
||||
std::reverse(output.begin() + output_size, output.end());
|
||||
}
|
||||
|
||||
private:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue