convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires workarounds to work correctly.
This commit is contained in:
parent
fa358e7071
commit
38913dc8dd
1 changed file with 3 additions and 3 deletions
|
@@ -2801,13 +2801,13 @@ class Mamba2Model(Model):
|
||||||
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
|
vocab_size = -(vocab_size // -pad_vocab) * pad_vocab
|
||||||
self.hparams["vocab_size"] = vocab_size
|
self.hparams["vocab_size"] = vocab_size
|
||||||
|
|
||||||
if (self.dir_model / "tokenizer.json").is_file():
|
if (self.dir_model / "tokenizer.model").is_file():
|
||||||
self._set_vocab_gpt2()
|
|
||||||
elif (self.dir_model / "tokenizer.model").is_file():
|
|
||||||
self._set_vocab_sentencepiece()
|
self._set_vocab_sentencepiece()
|
||||||
elif (self.dir_model / "tokenizer.model.v3").is_file():
|
elif (self.dir_model / "tokenizer.model.v3").is_file():
|
||||||
# mamba-codestral
|
# mamba-codestral
|
||||||
raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
|
raise NotImplementedError(f"Please rename {self.dir_model / 'tokenizer.model.v3'} to {self.dir_model / 'tokenizer.model'}")
|
||||||
|
elif (self.dir_model / "tokenizer.json").is_file():
|
||||||
|
self._set_vocab_gpt2()
|
||||||
else:
|
else:
|
||||||
# Use the GPT-NeoX tokenizer when no tokenizer files are present
|
# Use the GPT-NeoX tokenizer when no tokenizer files are present
|
||||||
self._set_vocab_builtin("gpt-neox", vocab_size)
|
self._set_vocab_builtin("gpt-neox", vocab_size)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue