mpt : remove unused tokenizer_json in convert script
commit 32172f12f5
parent 1a454eb561

1 changed file with 4 additions and 13 deletions
@@ -111,24 +111,15 @@ tokens: list[bytearray] = []
 scores: list[float] = []
 toktypes: list[int] = []
 
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
 # gpt2 tokenizer
 gguf_writer.add_tokenizer_model("gpt2")
 
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
 print("gguf: get gpt2 tokenizer vocab")
 
-# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]),
-# but there are only 50254 (len(tokenizer_json["model"]["vocab"]))
-# tokens in the vocab, presumably to accomodate some "reserved" tokens;
-# this is causing problems down the line in llama.cpp, so we pad the vocab
-# with dummy tokens:
+# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
+# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
+# accomodate some "reserved" tokens; this is causing problems down the line in
+# llama.cpp, so we pad the vocab with dummy tokens:
 
 vocab_size = hparams["vocab_size"]
 
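For context: the updated comment now references len(tokenizer.vocab) instead of the hand-parsed tokenizer_json, which is why the tokenizer.json loading above became dead code; the converter presumably gets its tokenizer from transformers. Below is a minimal standalone sketch of the padding that the comment describes, under that assumption. dir_model and hparams mirror names from the surrounding script but are given hypothetical values here, and the "[PAD{i}]" placeholder format is an assumption, not necessarily the script's exact choice:

    # Sketch: pad the gpt2 vocab up to the MPT embedding dimension.
    from pathlib import Path
    from transformers import AutoTokenizer

    dir_model = Path("./mpt-7b")        # hypothetical model directory
    hparams = {"vocab_size": 50432}     # as read from the model's config.json

    tokenizer = AutoTokenizer.from_pretrained(dir_model)

    vocab_size = hparams["vocab_size"]
    # map token id -> token string so we can fill the list in id order
    reverse_vocab = {tok_id: tok for tok, tok_id in tokenizer.vocab.items()}

    tokens: list[bytearray] = []
    for i in range(vocab_size):
        if i in reverse_vocab:
            tokens.append(bytearray(reverse_vocab[i].encode("utf-8")))
        else:
            # ids between len(tokenizer.vocab) (50254) and vocab_size (50432)
            # have no vocab entry, so insert dummy placeholder tokens
            tokens.append(bytearray(f"[PAD{i}]".encode("utf-8")))

    assert len(tokens) == vocab_size

Padding to hparams["vocab_size"] keeps the token list the same length as the token embedding tensor, so llama.cpp never encounters a token id without a corresponding vocab entry.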