Update convert.py
This commit is contained in:
parent 28f09beb60
commit 4adb8b9862
1 changed file with 42 additions and 11 deletions
convert.py (53 changed lines)
--- a/convert.py
+++ b/convert.py
@@ -310,6 +310,8 @@ class VocabLoader:
         self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer))
         vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()}

+        self.added_tokens_list = []
+        self.vocab_size_base: int = len(self.tokenizer.vocab)
         self.vocab_size: int = len(self.tokenizer.vocab)
         self.fname_tokenizer = fname_tokenizer

@@ -317,15 +319,21 @@ class VocabLoader:
         tokenizer = self.tokenizer
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}

-        for i in range(self.vocab_size):
+        for i in range(self.vocab_size_base):
             text = reverse_vocab[i].encode("utf-8")
             yield text, 0.0, gguf.TokenType.NORMAL

+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
     def has_newline_token(self):
         return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab

     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.hf_tokens()
+        yield from self.added_tokens()

     def get_vocab_type(self) -> str:
         path_candidates = []
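The new added_tokens() generator emits everything in added_tokens_list after the base vocabulary, with a fixed score of -1000.0 and type USER_DEFINED, and all_tokens() chains the two. A minimal stand-in sketch of that ordering (not the real VocabLoader; the two base tokens and the dummy token are hypothetical, and plain strings stand in for gguf.TokenType):

# Stand-in sketch only: shows the order all_tokens() yields tokens in --
# base vocab first, then added/padding tokens with score -1000.0.
class TinyVocab:
    def __init__(self) -> None:
        self.base_vocab = ["<s>", "hello"]         # hypothetical base tokens
        self.added_tokens_list = ["<dummy00001>"]  # e.g. a padding token

    def hf_tokens(self):
        for tok in self.base_vocab:
            yield tok.encode("utf-8"), 0.0, "NORMAL"

    def added_tokens(self):
        for text in self.added_tokens_list:
            yield text.encode("utf-8"), -1000.0, "USER_DEFINED"

    def all_tokens(self):
        yield from self.hf_tokens()
        yield from self.added_tokens()

print(list(TinyVocab().all_tokens()))
# [(b'<s>', 0.0, 'NORMAL'), (b'hello', 0.0, 'NORMAL'), (b'<dummy00001>', -1000.0, 'USER_DEFINED')]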
@@ -344,7 +352,7 @@ class VocabLoader:
         vocab_file = "tokenizer.json"
         path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file)
         if path_candidate:
-            if not self.tokenizer.can_save_slow_tokenizer():
+            if not self.has_newline_token():
                 return "gpt2"
             else:
                 return "llama"
@@ -355,7 +363,7 @@ class VocabLoader:
             "if it's in another directory, pass the directory as --vocab-dir")

     def __repr__(self) -> str:
-        return f"<VocabLoader with {self.vocab_size} tokens>"
+        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

 Vocab: TypeAlias = 'VocabLoader'

@@ -728,17 +736,27 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
                     break
             yield result

-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         if params.n_vocab == vocab.vocab_size:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
             vocab.vocab_size = vocab.vocab_size
             return
+
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
+                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+            vocab.vocab_size = params.n_vocab
+            return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
         msg += f" has {vocab.vocab_size})."
         if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
             msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
         raise Exception(msg)

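The pad_vocab branch only fires when the model header expects more tokens than the tokenizer metadata provides; the gap is filled with numbered <dummyNNNNN> tokens. A sketch of just that arithmetic, assuming a hypothetical model that expects 32016 tokens against a 32000-token tokenizer:

# Padding arithmetic only, with hypothetical sizes (not a call into convert.py).
n_vocab = 32016      # vocab size the model hyperparameters expect
vocab_size = 32000   # vocab size the tokenizer metadata provides

pad_count = n_vocab - vocab_size
added = [f'<dummy{i:05}>' for i in range(1, pad_count + 1)]

print(pad_count)               # 16
print(added[0], added[-1])     # <dummy00001> <dummy00016>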
@@ -812,8 +830,12 @@ class OutputFile:
         self.gguf.close()

     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)

         of = OutputFile(fname_out, endianess=endianess)

@@ -840,8 +862,14 @@ class OutputFile:
         return dt.quantize(arr)

     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_all(
+        fname_out  : Path, ftype: GGMLFileType, params: Params,
+        model      : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        endianess  : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab  : bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)

         of = OutputFile(fname_out, endianess=endianess)

@@ -1054,6 +1082,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
     parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
+    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")

     args = parser.parse_args(args_in)
     if args.dump_single:
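Since main() takes an optional argument list and forwards it to parser.parse_args(args_in), the new flag can be passed either on the command line or programmatically. A hedged usage sketch; the model directory and output name are placeholders, and the positional model argument is assumed from the rest of the script:

# Hypothetical invocation; paths are placeholders.
# Shell equivalent (assumed): python convert.py models/my-model --outfile my-model.gguf --padvocab
from convert import main

main([
    "models/my-model",             # model directory (placeholder)
    "--outfile", "my-model.gguf",  # output GGUF file (placeholder)
    "--padvocab",                  # pad vocab with <dummyNNNNN> tokens when needed
])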
@@ -1101,7 +1130,8 @@ def main(args_in: list[str] | None = None) -> None:
                                           load_merges = True,
                                           n_vocab = vocab.vocab_size)
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess = endianess, pad_vocab = args.padvocab)
         print(f"Wrote {outfile}")
         return

@@ -1127,7 +1157,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
     print(f"Wrote {outfile}")
