converter script fixes
This commit is contained in:
parent e0504d536c
commit 0c69016171
1 changed file with 22 additions and 60 deletions
convert.py (82 changed lines)
@@ -332,6 +332,9 @@ class Params:
 #
 
 class BpeVocab:
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         if isinstance(self.bpe_tokenizer.get('model'), dict):
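Note: the change above is the pattern for the whole commit: the vocab classes stop exposing their identity through getter methods and carry it as plain class attributes instead. A minimal sketch of the before/after shape (the *Old/*New class names are illustrations, not names from convert.py):

```python
# Illustration only: neither class exists in convert.py under these names.
class BpeVocabOld:
    def get_tokenizer_model(self) -> str:
        return "gpt2"

    def get_name(self) -> str:
        return "bpe"


class BpeVocabNew:
    tokenizer_model = "gpt2"
    name = "bpe"


# Both spellings produce the same strings; the attribute form also works
# on the class itself, with no instance required.
assert BpeVocabOld().get_tokenizer_model() == BpeVocabNew.tokenizer_model
assert BpeVocabOld().get_name() == BpeVocabNew.name
```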
@@ -385,17 +388,14 @@ class BpeVocab:
         yield from self.bpe_tokens()
         yield from self.added_tokens()
 
-    def get_tokenizer_model(self) -> str:
-        return "gpt2"
-
-    def get_name(self) -> str:
-        return "bpe"
-
     def __repr__(self) -> str:
         return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
 class SentencePieceVocab:
+    tokenizer_model = "llama"
+    name = "spm"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
         added_tokens: dict[str, int]
@@ -454,17 +454,14 @@ class SentencePieceVocab:
         yield from self.sentencepiece_tokens()
         yield from self.added_tokens()
 
-    def get_tokenizer_model(self) -> str:
-        return "llama"
-
-    def get_name(self) -> str:
-        return "spm"
-
     def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
 class HfVocab:
+    tokenizer_model = "llama"
+    name = "hfft"
+
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
         try:
             from transformers import AutoTokenizer
@@ -561,22 +558,13 @@ class HfVocab:
         yield from self.hf_tokens()
         yield from self.added_tokens()
 
-    def get_tokenizer_model(self) -> str:
-        return "llama"
-
-    def get_name(self) -> str:
-        return "hfft"
-
     def __repr__(self) -> str:
         return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
 class NoVocab:
-    def get_tokenizer_model(self) -> str:
-        return "no_vocab"
-
-    def get_name(self) -> str:
-        return "no_vocab"
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
 
     def __repr__(self) -> str:
         return "<NoVocab for a model without integrated vocabulary>"
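Note: with NoVocab converted as well, all four vocab classes (BpeVocab, SentencePieceVocab, HfVocab, NoVocab) now share the same two-attribute surface, which is what lets the later hunks branch on vocab.name instead of isinstance checks. A sketch of that informal interface, using typing.Protocol purely as illustration (convert.py itself relies on duck typing):

```python
from typing import Protocol


class HasVocabIdentity(Protocol):
    """Hypothetical protocol: every vocab class in this diff satisfies it."""
    tokenizer_model: str
    name: str


def describe(vocab: HasVocabIdentity) -> str:
    return f"{vocab.name} vocab (tokenizer_model={vocab.tokenizer_model})"


class NoVocab:  # mirrors the class as it stands after this hunk
    tokenizer_model = "no_vocab"
    name = "no_vocab"


print(describe(NoVocab()))  # no_vocab vocab (tokenizer_model=no_vocab)
```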
@@ -969,8 +957,9 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
 
 
 def prepare_vocab(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
-    assert not isinstance(vocab, NoVocab)
     check_vocab_size(params, vocab)
+    if vocab.name == "no_vocab":
+        return
 
     # Check for a vocab size mismatch
     if params.n_vocab == vocab.vocab_size:
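Note: the hard `assert not isinstance(vocab, NoVocab)` is gone; prepare_vocab now treats a vocab-less model as a no-op instead of a programming error, so one code path serves both cases. Control-flow sketch with the collaborators stubbed out (the stub names are hypothetical):

```python
def check_vocab_size_stub(params, vocab) -> None:
    pass  # stands in for convert.py's real check_vocab_size


def prepare_vocab_sketch(params, vocab, pad_vocab: bool = False) -> None:
    check_vocab_size_stub(params, vocab)
    if vocab.name == "no_vocab":
        return  # vocab-less model: nothing to validate or pad
    # vocab-size mismatch handling for real vocabs continues here
```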
@@ -1065,7 +1054,7 @@ class OutputFile:
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
         # Ensure that tokenizer_model is added to the GGUF model
-        self.gguf.add_tokenizer_model(vocab.get_tokenizer_model())
+        self.gguf.add_tokenizer_model(vocab.tokenizer_model)
 
         # Extract model vocabulary for model conversion
         tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
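Note: this one-line change is safe because Python resolves an attribute miss on an instance by falling through to its class, so every call site holding a vocab instance reads the new class attribute unchanged:

```python
# Plain attribute-lookup semantics, shown with a throwaway class.
class Demo:
    tokenizer_model = "llama"


d = Demo()
assert d.tokenizer_model == "llama"     # instance lookup falls through to the class
assert Demo.tokenizer_model == "llama"  # class lookup needs no instance at all
```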
@@ -1158,34 +1147,11 @@ class OutputFile:
 
         # meta data
         of.add_meta_arch(params)
-        of.add_meta_vocab(vocab)
-        of.add_meta_special_vocab(svocab)
-
-        # tensor info
-        for name, lazy_tensor in model.items():
-            of.add_tensor_info(name, lazy_tensor)
-
-        of.write_meta()
-        of.write_tensor_info()
-
-        # tensor data
-        of.write_tensor_data(ftype, model, concurrency)
-
-        of.close()
-
-    @staticmethod
-    def write_without_vocab(
-        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab,
-        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
-    ) -> None:
-        assert isinstance(vocab, NoVocab)
-        check_vocab_size(params, vocab)
-
-        of = OutputFile(fname_out, endianess=endianess)
-
-        # meta data
-        of.add_meta_arch(params)
-        of.gguf.add_tokenizer_model(vocab.get_tokenizer_model())
+        if vocab.name == "no_vocab":
+            of.gguf.add_tokenizer_model(vocab.tokenizer_model)
+        else:
+            of.add_meta_vocab(vocab)
+            of.add_meta_special_vocab(svocab)
 
         # tensor info
         for name, lazy_tensor in model.items():
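Note: this hunk deletes write_without_vocab outright. Its only real difference from write_all was the metadata step, so that difference becomes a two-way branch and the tensor-writing tail (add_tensor_info, write_meta, write_tensor_data, close) now exists once. The consolidated metadata step, sketched as a standalone helper (the helper name is hypothetical; of, vocab, and svocab follow the diff's usage):

```python
def add_vocab_metadata(of, vocab, svocab) -> None:
    if vocab.name == "no_vocab":
        # model ships without a vocabulary: record only the tokenizer family
        of.gguf.add_tokenizer_model(vocab.tokenizer_model)
    else:
        of.add_meta_vocab(vocab)           # tokens, scores, token types
        of.add_meta_special_vocab(svocab)  # special-token metadata
```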
@@ -1357,7 +1323,7 @@ class VocabFactory:
             raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
 
     def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
-        load_merges = vocab.get_name() == "bpe"
+        load_merges = vocab.name == "bpe"
         n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
         return gguf.SpecialVocab(
             model_parent_path,
@@ -1513,12 +1479,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")
 
-    if not args.no_vocab:
-        OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                             concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
-    else:
-        OutputFile.write_without_vocab(outfile, ftype, params, model, vocab,
-                                       concurrency=args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
 
     print(f"Wrote {outfile}")
 
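Note: with the writer consolidated, main() no longer branches on args.no_vocab; both conversion modes end in the same OutputFile.write_all call. The shape of that simplification in miniature (all names here are hypothetical stand-ins):

```python
def write_all_sketch(vocab) -> str:
    # the single entry point decides; callers no longer pick a writer
    if vocab.name == "no_vocab":
        return "arch metadata + tokenizer_model + tensors"
    return "arch metadata + full vocab + tensors"


class NoVocabStub:
    tokenizer_model = "no_vocab"
    name = "no_vocab"


print(write_all_sketch(NoVocabStub()))
```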