Add support for loading merges.txt
Add --padvocab option to convert.py
Other minor cleanups
parent 2833a6f63c
commit fb8e2fe606

3 changed files with 62 additions and 8 deletions

convert.py (36 lines changed)
@@ -779,7 +779,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
                 break
         yield result
 
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
         if params.n_vocab == vocab.vocab_size_base:
@@ -787,12 +787,21 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
             vocab.added_tokens_list = []
             vocab.vocab_size = vocab.vocab_size_base
             return
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
+                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+            vocab.vocab_size = params.n_vocab
+            return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
         if vocab.fname_added_tokens is not None:
             msg += f" combined with {vocab.fname_added_tokens}"
         msg += f" has {vocab.vocab_size})."
         if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
             msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
         raise Exception(msg)
 
 
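The new branch above only fires when --padvocab is given and the model header asks for more tokens than the tokenizer metadata supplies; it then appends <dummyNNNNN> placeholders until the counts match. Below is a minimal standalone sketch of that rule, independent of convert.py (SimpleVocab is a stand-in, not one of convert.py's vocab classes):

from __future__ import annotations
from dataclasses import dataclass, field

@dataclass
class SimpleVocab:
    # Stand-in for convert.py's vocab objects: only the two fields the padding rule touches.
    vocab_size: int
    added_tokens_list: list[str] = field(default_factory=list)

def pad_vocab_to(vocab: SimpleVocab, n_vocab: int) -> None:
    # Same rule as the new check_vocab_size() branch: append <dummyNNNNN>
    # placeholder tokens until the tokenizer vocab matches the model's n_vocab.
    pad_count = n_vocab - vocab.vocab_size
    for i in range(1, pad_count + 1):
        vocab.added_tokens_list.append(f'<dummy{i:05}>')
    vocab.vocab_size = n_vocab

vocab = SimpleVocab(vocab_size=32000)
pad_vocab_to(vocab, 32003)
print(vocab.added_tokens_list)  # ['<dummy00001>', '<dummy00002>', '<dummy00003>']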
@@ -877,8 +886,12 @@ class OutputFile:
         self.gguf.close()
 
     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -905,8 +918,14 @@ class OutputFile:
             return dt.quantize(arr)
 
     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_all(
+        fname_out  : Path, ftype: GGMLFileType, params: Params,
+        model      : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        endianess  : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab  : bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1126,6 +1145,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
     parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
+    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
 
     args = parser.parse_args(args_in)
     if args.dump_single:
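The flag is a plain store_true option, so it defaults to off and conversions behave exactly as before unless --padvocab is appended to the usual convert.py command line. A small self-contained sketch of how the parsed value then reaches the writer; toy_write_all here is only a stand-in for OutputFile.write_all:

import argparse

def toy_write_all(outfile: str, pad_vocab: bool = False) -> None:
    # Stand-in for OutputFile.write_all(); only shows how the flag is forwarded.
    print(f'writing {outfile}, pad_vocab={pad_vocab}')

parser = argparse.ArgumentParser()
parser.add_argument("--outfile", default="out.gguf")
parser.add_argument("--padvocab", action="store_true",
                    help="add pad tokens when model vocab expects more than tokenizer metadata provides")

args = parser.parse_args(["--padvocab"])                # simulate a command line using the new flag
toy_write_all(args.outfile, pad_vocab = args.padvocab)  # -> writing out.gguf, pad_vocab=True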
@@ -1173,7 +1193,8 @@ def main(args_in: list[str] | None = None) -> None:
             load_merges = args.vocabtype == 'bpe',
             n_vocab = vocab.vocab_size)
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess = endianess, pad_vocab = args.padvocab)
         print(f"Wrote {outfile}")
         return
 
@@ -1196,7 +1217,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")
 
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
     print(f"Wrote {outfile}")
 
 
@@ -1023,6 +1023,35 @@ class SpecialVocab:
     def _load(self, path: Path) -> None:
         if not self._try_load_from_tokenizer_json(path):
             self._try_load_from_config_json(path)
+        if self.load_merges and len(self.merges) == 0:
+            self._try_load_merges_txt(path)
+
+    def _try_load_merges_txt(self, path: Path) -> bool:
+        merges_file = path / 'merges.txt'
+        if not merges_file.is_file():
+            return False
+        with open(merges_file, 'r') as fp:
+            first_line = next(fp, '').strip()
+            if not first_line.startswith('#'):
+                fp.seek(0)
+                line_num = 0
+            else:
+                line_num = 1
+            merges = []
+            for line in fp:
+                line_num += 1
+                line = line.strip()
+                if len(line) == 0:
+                    continue
+                parts = line.split(None, 3)
+                if len(parts) != 2:
+                    print(f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
+                        file = sys.stderr)
+                    continue
+                merges.append(f'{parts[0]} {parts[1]}')
+        self.merges = merges
+        return True
+
 
     def _set_special_token(self, typ: str, tid: Any):
         if not isinstance(tid, int) or tid < 0:
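This hunk and the next come from the gguf Python package (the same package whose pyproject.toml version is bumped at the end of this commit), which convert.py imports. merges.txt is the plain-text BPE merge list that GPT-2 style tokenizers ship alongside vocab.json: an optional '#version' comment on the first line, then one merge per line as two space-separated symbols. The new loader is a fallback for when tokenizer.json did not provide any merges. Below is a simplified, self-contained sketch of the same parsing; it skips any '#'-prefixed or blank line, which is slightly looser than the first-line-only header check above, and the sample file contents are illustrative:

import sys
from pathlib import Path

# Write a tiny GPT-2/BPE style merges.txt to parse.
sample = "#version: 0.2\nĠ t\nh e\nthis line is malformed\n"
Path('merges.txt').write_text(sample, encoding='utf-8')

merges: list[str] = []
with open('merges.txt', 'r', encoding='utf-8') as fp:
    for line_num, line in enumerate(fp, start=1):
        line = line.strip()
        if not line or line.startswith('#'):
            continue                      # header comment or blank line
        parts = line.split(None, 3)
        if len(parts) != 2:               # a merge entry must be exactly two symbols
            print(f'merges.txt: Line {line_num}: Entry malformed, ignoring', file=sys.stderr)
            continue
        merges.append(f'{parts[0]} {parts[1]}')

print(merges)  # ['Ġ t', 'h e']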
@@ -1083,6 +1112,9 @@ class SpecialVocab:
             if not quiet:
                 print(f'gguf: Adding {len(self.merges)} merge(s).')
             gw.add_token_merges(self.merges)
+        elif self.load_merges:
+            print('gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
+                file = sys.stderr)
         for typ, tokid in self.special_token_ids.items():
             handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
             if handler is None:
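With the fallback and the warning in place, requesting merges for a model directory that has neither a tokenizer.json merge list nor a merges.txt no longer passes silently: add_to_gguf() now prints the warning above. A hedged usage sketch, assuming the gguf package is importable and using a placeholder model directory (load_merges mirrors how convert.py constructs SpecialVocab in the hunks further up):

from pathlib import Path
import gguf

model_dir = Path('path/to/hf-model')   # placeholder directory with no tokenizer files
special_vocab = gguf.SpecialVocab(model_dir, load_merges = True)
print(len(special_vocab.merges))       # 0 here, since nothing supplied any merges
# When this SpecialVocab is later written via add_to_gguf(), the empty merge list
# now triggers the "no merges found" warning instead of producing a silently
# non-functional BPE model.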
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.4.5"
+version = "0.4.6"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [