Tokenizer SPM fixes for phi-3 and llama-spm (bugfix) (#7425)

* Update brute force test: add_special
* Update brute force test: default values for add_bos_token and add_eos_token
* Enable rtrim when pre-inserting BOS

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "server : fix test regexes"
This commit is contained in:
jaime-m-p 2024-05-21 14:39:48 +02:00 committed by GitHub
parent 917dc8cfa6
commit d7e852c1bc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 28 additions and 23 deletions

View file

@@ -1749,7 +1749,7 @@ class Phi3MiniModel(Model):
token_id = int(token_id)
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
- assert(tokens[token_id] == token)
+ assert tokens[token_id] == token
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1765,7 +1765,7 @@ class Phi3MiniModel(Model):
token_id = int(foken_data["id"])
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
- assert(tokens[token_id] == token)
+ assert tokens[token_id] == token
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED