model: dbrx: convert fix tokenizer
commit b6522a9f5b
parent 305ac3b61b

1 changed file with 4 additions and 1 deletion
@@ -1457,6 +1457,8 @@ class Qwen2MoeModel(Model):
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.get_vocab().items()}
         added_vocab = tokenizer.get_added_vocab()
 
+        self.gguf_writer.add_chat_template(tokenizer.default_chat_template)
+
         # REVIEW: Not tested yet, need to deep dive this tiktoken
         for i in range(vocab_size):
             if i not in reverse_vocab:
@@ -1476,7 +1478,8 @@ class Qwen2MoeModel(Model):
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)
 
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+        special_vocab = gguf.SpecialVocab(dir_model)  # FIXME https://huggingface.co/databricks/dbrx-instruct/blob/main/tokenizer_config.json
+        special_vocab.merges = []
         special_vocab.add_to_gguf(self.gguf_writer)
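
For context, the first hunk sits inside the GPT-2-style vocab export: the converter builds an id-to-token table from the Hugging Face tokenizer, flags added tokens, and now also writes the tokenizer's chat template into the GGUF metadata. The sketch below is a minimal, hedged reconstruction of that step, not the exact script: the model id, the vocab_size source, and the token-type choices are assumptions. Note also that transformers has since deprecated tokenizer.default_chat_template in favor of tokenizer.chat_template.

    # Minimal sketch of the surrounding conversion step (assumptions marked);
    # it mirrors the diff context, not the exact convert script.
    import gguf
    from transformers import AutoTokenizer

    # Assumption: model id and trust_remote_code flag; the real script reads
    # the tokenizer from a local model directory.
    tokenizer = AutoTokenizer.from_pretrained(
        "databricks/dbrx-instruct", trust_remote_code=True
    )
    vocab_size = len(tokenizer.get_vocab())  # assumption: script reads hparams

    reverse_vocab = {id_: tok for tok, id_ in tokenizer.get_vocab().items()}
    added_vocab = tokenizer.get_added_vocab()

    tokens: list[str] = []
    toktypes: list[int] = []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            # Fill holes in the id space with placeholder tokens.
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.USER_DEFINED)
        elif reverse_vocab[i] in added_vocab:
            # Added/special tokens; the real script further distinguishes
            # control tokens from plain user-defined additions.
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.CONTROL)
        else:
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.NORMAL)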
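
The second hunk is the actual fix: dbrx ships a tiktoken-derived tokenizer with no BPE merges file, so constructing SpecialVocab with load_merges=True presumably had nothing valid to load. The commit drops the flag and clears .merges explicitly, while the FIXME points at dbrx-instruct's tokenizer_config.json for the special-token details still to be wired up. A hedged sketch of the fixed call using the gguf-py API, with an illustrative writer setup:

    # Hedged sketch of the fixed special-vocab handling; the output path and
    # arch name are illustrative, not taken from the commit.
    from pathlib import Path

    import gguf

    dir_model = Path("./dbrx-instruct")  # assumption: local HF checkout
    gguf_writer = gguf.GGUFWriter("dbrx.gguf", arch="dbrx")

    # load_merges defaults to False, and merges is cleared explicitly so that
    # no BPE merge list is written for this tokenizer.
    special_vocab = gguf.SpecialVocab(dir_model)
    special_vocab.merges = []
    special_vocab.add_to_gguf(gguf_writer)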