clean up convert scripts

Phillip Kravtsov 2023-09-28 22:16:59 -07:00
parent d0a7143f71
commit fa92f6e827
2 changed files with 17 additions and 22 deletions

convert-persimmon-to-gguf.py

@@ -21,7 +21,7 @@ def file_is_safetensors(path: Path) -> bool:
 
 def get_tokenizer_info(dir_model: Path):
     tokenizer_path = dir_model / 'adept_vocab.model'
-    print('gguf: get sentencepiece tokenizer from', tokenizer_path)
+    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
     tokenizer = SentencePieceProcessor(str(tokenizer_path))
     tokens: list[bytes] = []
     scores: list[float] = []
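
For context, the rest of get_tokenizer_info walks the SentencePiece vocab to fill the three parallel lists it returns. A minimal sketch of that walk, assuming the standard sentencepiece Python API rather than quoting the file:

    from sentencepiece import SentencePieceProcessor

    def walk_vocab(tokenizer: SentencePieceProcessor):
        tokens, scores, toktypes = [], [], []
        for i in range(tokenizer.vocab_size()):
            tokens.append(tokenizer.id_to_piece(i).encode('utf-8'))
            scores.append(tokenizer.get_score(i))
            toktypes.append(1)  # assumption: 1 = NORMAL in the GGUF token-type enum
        return tokens, scores, toktypes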
@@ -55,20 +55,20 @@ def get_tokenizer_info(dir_model: Path):
     return tokens, scores, toktypes
 
-def main(args_in: list[str] | None = None) -> None:
+def get_args():
     parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
     parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
     parser.add_argument("--outtype", choices=["f32"], help="currently only support fp32")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
     parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.safetensors)")
     parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
-    args = parser.parse_args(args_in)
+    args = parser.parse_args()
+    return args
+
+def main() -> None:
+    args = get_args()
     assert file_is_safetensors(args.model), 'Error: model file is not a SafeTensors file'
     dir_model = args.model.parent
     with open(dir_model / 'config.json', 'r') as f:
         hparams = json.load(f)
     pprint(hparams)
     arch = gguf.MODEL_ARCH.PERSIMMON
     gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
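
Note that dropping the args_in parameter means parse_args() now always reads sys.argv, so main() can no longer be driven with a synthetic argument list; the entry point (outside this hunk) reduces to the usual guard:

    if __name__ == '__main__':
        main()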
@@ -88,14 +88,14 @@ def main(args_in: list[str] | None = None) -> None:
     gguf_writer.add_head_count_kv(head_count_kv)
     gguf_writer.add_rope_freq_base(hparams['rotary_emb_base'])
     gguf_writer.add_layer_norm_eps(hparams['layernorm_epsilon'])
-    if True:
-        tokens, scores, toktypes = get_tokenizer_info(dir_model)
-        gguf_writer.add_tokenizer_model('llama')
-        gguf_writer.add_token_list(tokens)
-        gguf_writer.add_token_scores(scores)
-        gguf_writer.add_token_types(toktypes)
-        gguf_writer.add_bos_token_id(71013)
-        gguf_writer.add_eos_token_id(71013)
+    tokens, scores, toktypes = get_tokenizer_info(dir_model)
+    gguf_writer.add_tokenizer_model('llama')
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+    gguf_writer.add_bos_token_id(71013)
+    gguf_writer.add_eos_token_id(71013)
     tensor_map = gguf.get_tensor_name_map(arch, block_count)
     print(tensor_map)
     tensors = {}
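
For context, once the metadata and tensors are registered, converters built on gguf-py of this era typically flush and close the writer as below; this is a sketch of the usual call sequence, not lines from this commit:

    gguf_writer.write_header_to_file()
    gguf_writer.write_kv_data_to_file()
    gguf_writer.write_tensors_to_file()
    gguf_writer.close()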

convert.py

@@ -439,7 +439,7 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
     #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
     if n_head_kv is not None and n_head != n_head_kv:
-        n_head = n_head_kv
+        n_head //= n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
             .swapaxes(1, 2)
             .reshape(weights.shape))
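
The restored //= matches the upstream convert.py of the time: for grouped-query checkpoints the head count used by the reshape is divided down rather than overwritten. A toy check of what the reshape/swapaxes pipeline does, with made-up sizes:

    import numpy as np

    n_head, rows = 2, 8  # hypothetical weight: 2 heads, 4 rows each
    w = np.arange(rows * 3).reshape(rows, 3)
    out = (w.reshape(n_head, 2, rows // n_head // 2, 3)
            .swapaxes(1, 2)
            .reshape(w.shape))
    # within each head the row order goes [0 1 2 3] -> [0 2 1 3]:
    # interleaved rotary pairs are regrouped into the two-half layout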
@@ -701,18 +701,13 @@ class LazyUnpickler(pickle.Unpickler):
     def find_class(self, module: str, name: str) -> Any:
         if not module.startswith('torch'):
             return super().find_class(module, name)
-        if (module, name) in self.CLASSES:
-            return self.CLASSES[(module, name)]
-        else:
-            print(f'Missing mapping for {module}.{name}')
-            raise KeyError
+        return self.CLASSES[(module, name)]
 
 def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
     zf = zipfile.ZipFile(outer_fp)
     pickle_paths = [name for name in zf.namelist() if name.endswith('.pkl')]
     assert len(pickle_paths) == 1, pickle_paths
-    print(pickle_paths)
     pickle_fp = zf.open(pickle_paths[0], 'r')
     unpickler = LazyUnpickler(pickle_fp,
                               data_base_path=pickle_paths[0][:-4],