feat: Add prototype for identifying the vocab type
parent dcc5d4241d
commit c6f2a48af7
1 changed file with 14 additions and 8 deletions
@@ -386,13 +386,10 @@ class Model:
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
-
-        tokpre = self.get_vocab_base_pre(tokenizer)
-
+        assert tokenizer.vocab_size == vocab_size
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
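This first hunk tightens the vocab-size check and keeps the token list dense: any ID in [0, vocab_size) that the tokenizer does not map is filled with a placeholder. A minimal standalone sketch of that padding step, using a toy vocabulary in place of a real AutoTokenizer (names and data here are illustrative only):

    # Toy token -> ID table with a gap at ID 2; stands in for tokenizer.vocab.
    vocab = {"hello": 0, "world": 1, "!": 3}
    vocab_size = 5

    # Invert to ID -> token, mirroring reverse_vocab in the hunk above.
    reverse_vocab = {id_: tok for tok, id_ in vocab.items()}

    # Unmapped IDs become [PAD{i}] placeholders so indices stay contiguous.
    tokens = [reverse_vocab.get(i, f"[PAD{i}]") for i in range(vocab_size)]

    print(tokens)  # ['hello', 'world', '[PAD2]', '!', '[PAD4]']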
@@ -407,6 +404,7 @@ class Model:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
 
+        tokpre = self.get_vocab_base_pre(tokenizer)
         return tokens, toktypes, tokpre
 
     # NOTE: this function is generated by convert-hf-to-gguf-update.py
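Moving the tokpre assignment down means get_vocab_base resolves the pre-tokenizer id in one place, right before returning. The three returned values map onto GGUF vocab metadata; assuming a gguf-py recent enough to expose GGUFWriter.add_tokenizer_pre (the writer method for the "tokenizer.ggml.pre" key described below), a caller might look roughly like this sketch:

    import gguf

    # Hypothetical caller inside a Model subclass; the file name is illustrative.
    writer = gguf.GGUFWriter("model.gguf", "llama")
    tokens, toktypes, tokpre = self.get_vocab_base()
    writer.add_tokenizer_model("gpt2")  # BPE-style tokenizer model
    writer.add_tokenizer_pre(tokpre)    # -> "tokenizer.ggml.pre"
    writer.add_token_list(tokens)       # -> "tokenizer.ggml.tokens"
    writer.add_token_types(toktypes)    # -> "tokenizer.ggml.token_type"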
@@ -418,7 +416,6 @@ class Model:
         # is specific for the BPE pre-tokenizer used by the model
         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
         # use in llama.cpp to implement the same pre-tokenizer
-
         checksum = sha256(str(tokenizer.vocab).encode()).hexdigest()
         logger.debug(f"checksum: {checksum}")
 
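The unique identifier is just a SHA-256 digest of the tokenizer's vocabulary serialized with str(). A self-contained sketch of that fingerprinting step, with a toy vocab standing in for tokenizer.vocab:

    from hashlib import sha256

    # Fingerprint the vocab; the diff hashes str(tokenizer.vocab) the same way.
    vocab = {"hello": 0, "world": 1}
    checksum = sha256(str(vocab).encode()).hexdigest()
    print(checksum)

One design note: str() on a dict reflects insertion order, so the digest is only reproducible when the tokenizer enumerates its vocab in a stable order.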
@@ -427,12 +424,21 @@ class Model:
         # Run the `gguf-py/scripts/gguf-gen-pre.py` script to generate the checksums.
         # This script should ideally pull in the latest version of the model from HuggingFace.
         # DO NOT MANUALLY EDIT THIS METHOD!
-        models = json.load("models/checksums.json")
+        models = json.load(f"{tokenizer.name_or_path}/checksums.json")
         for model in models:
             if checksum == model["checksum"]:
-                logger.debug(f"tokenizer.ggml.pre: {repr(model['repo'])}")
+                pre = None
+                if model["tokt"] == gguf.TokenizerType.BPE:
+                    pre = "bpe"
+                elif model["tokt"] == gguf.TokenizerType.SPM:
+                    pre = "spm"
+                elif model["tokt"] == gguf.TokenizerType.WPM:
+                    pre = "wpm"
+                else:
+                    raise KeyError()
                 logger.debug(f"tokenizer checksum: {checksum}")
-                return model["tokt"]  # NOTE: Use the enum to id the vocab
+                logger.debug(f"tokenizer.ggml.pre: {pre}")
+                return pre  # NOTE: Use the enum to id the vocab
 
         logger.warning("\n")
         logger.warning("**************************************************************************************")
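One caveat for anyone picking up this prototype: json.load expects an open file object, not a path string (json.loads is the variant that takes a string), so both the old and new forms of the models = json.load(...) line would raise an AttributeError at runtime. A sketch of the intended lookup with the file handling fixed, assuming this branch's gguf.TokenizerType enum and a checksums.json holding a list of records with the "repo", "tokt", and "checksum" fields used above (the helper name and default path are hypothetical):

    import json

    import gguf  # assumes this branch's gguf-py, which defines TokenizerType

    def identify_pre(checksum: str, path: str = "models/checksums.json") -> str:
        # json.load needs a file handle, so open the path first.
        with open(path) as f:
            models = json.load(f)
        for model in models:
            if checksum != model["checksum"]:
                continue
            # Map the tokenizer-type enum onto the "tokenizer.ggml.pre" string.
            if model["tokt"] == gguf.TokenizerType.BPE:
                return "bpe"
            if model["tokt"] == gguf.TokenizerType.SPM:
                return "spm"
            if model["tokt"] == gguf.TokenizerType.WPM:
                return "wpm"
            raise KeyError(f"unknown tokenizer type: {model['tokt']}")
        raise KeyError(f"no entry matches checksum {checksum}")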