try to make tokenizer work

This commit is contained in:
vincent 2024-02-06 00:10:27 +08:00
parent ec4507a99c
commit 36084596c1

View file

@ -22,6 +22,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
from convert import HfVocab
# check for any of the given keys in the dictionary and return the value of the first key found # check for any of the given keys in the dictionary and return the value of the first key found
def get_key_opts(d, keys): def get_key_opts(d, keys):
@ -406,6 +407,31 @@ class Model:
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_hf(self):
path = self.dir_model
added_tokens_path = self.dir_model
vocab = HfVocab(
path, added_tokens_path if added_tokens_path.exists() else None
)
tokens = []
scores = []
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
assert len(tokens) == vocab.vocab_size
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
class GPTNeoXModel(Model): class GPTNeoXModel(Model):
def set_gguf_parameters(self): def set_gguf_parameters(self):
@ -1058,7 +1084,7 @@ class MiniCPMModel(Model):
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
def set_vocab(self): def set_vocab(self):
self._set_vocab_sentencepiece() self._set_vocab_hf()
class QwenModel(Model): class QwenModel(Model):
@staticmethod @staticmethod