convert-falcon-hf-to-gguf.py : fix special token mapping
parent 7bbbf38c32
commit 9853f2cfb2
1 changed file with 12 additions and 27 deletions
@@ -164,38 +164,23 @@ if Path(dir_model + "/tokenizer.json").is_file():
     gguf_writer.add_token_scores(scores)
     gguf_writer.add_token_types(toktypes)
 
-    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
-        print("gguf: get special token ids")
+    print("gguf: get special token ids")
+    # Look for special tokens in config.json
 
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
+    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
 
-        # find special token ids
+    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
 
-        if "bos_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]:
-                    gguf_writer.add_bos_token_id(key["id"])
+    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
 
-        if "eos_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]:
-                    gguf_writer.add_eos_token_id(key["id"])
+    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
 
-        if "unk_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]:
-                    gguf_writer.add_unk_token_id(key["id"])
+    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
 
-        if "sep_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]:
-                    gguf_writer.add_pad_token_id(key["id"])
-
 
 # TENSORS
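In short, the removed block resolved each special token by matching its text from tokenizer_config.json against the added_tokens list in tokenizer.json, while the replacement reads the numeric *_token_id fields the script already holds in hparams (loaded from config.json). The following self-contained sketch contrasts the two lookups for the eos token; the file contents here are hypothetical example data, not Falcon's actual configuration:

import json

# Hypothetical stand-ins for the three files the converter reads from the
# model directory; real runs load config.json, tokenizer.json and
# tokenizer_config.json instead.
hparams = {"eos_token_id": 11}                                               # config.json
tokenizer_json = {"added_tokens": [{"id": 11, "content": "<|endoftext|>"}]}  # tokenizer.json
tokenizer_config = {"eos_token": "<|endoftext|>"}                            # tokenizer_config.json

# Removed approach: map the token text to an id via added_tokens.
old_eos_id = None
if "eos_token" in tokenizer_config:
    for key in tokenizer_json["added_tokens"]:
        if key["content"] == tokenizer_config["eos_token"]:
            old_eos_id = key["id"]

# New approach: take the id directly from config.json.
new_eos_id = None
if "eos_token_id" in hparams and hparams["eos_token_id"] is not None:
    new_eos_id = hparams["eos_token_id"]

print(old_eos_id, new_eos_id)  # prints "11 11" for this example data

The same pattern repeats for the bos, unk, sep, and pad tokens in the diff above.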