fix tokenizer.json tokenizer_config.json cpu()

Achazwl 2024-05-03 10:06:36 +08:00
parent 6c1c4b4688
commit 36bff51a7a

@@ -1,6 +1,6 @@
 import argparse
 import glob
-import os
+import os, json
 import torch
 from transformers import AutoModel, AutoTokenizer

@@ -16,12 +16,12 @@ checkpoint = model.state_dict()
 mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]

 # store these tensors in a new dictionary and torch.save them
-projector = {name: checkpoint[name].float() for name in mm_tensors}
+projector = {name: checkpoint[name].float().cpu() for name in mm_tensors}
 torch.save(projector, f"{args.model}/llava.projector")

 clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]
 if len(clip_tensors) > 0:
-    clip = {name.replace("vpm.", ""): checkpoint[name].float() for name in clip_tensors}
+    clip = {name.replace("vpm.", ""): checkpoint[name].float().cpu() for name in clip_tensors}
     torch.save(clip, f"{args.model}/llava.clip")

 # added tokens should be removed to be able to convert Mistral models
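
Note on the .cpu() calls added above: torch.save records each tensor together with the device it lives on, so a projector file written from CUDA tensors can only be reloaded on a CPU-only machine by passing map_location explicitly. Moving the tensors to CPU before saving makes llava.projector and llava.clip portable. A minimal sketch of the pattern (the tensor name, shape, and file name here are illustrative, not taken from the model):

import torch

# Illustrative only: stands in for a checkpoint entry that may live on GPU.
w = torch.randn(4, 4)
if torch.cuda.is_available():
    w = w.cuda()

# Saved after .float().cpu(), the file loads anywhere, no map_location needed.
torch.save({"w": w.float().cpu()}, "llava.projector")
state = torch.load("llava.projector")
print(state["w"].device)  # cpu
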
@@ -42,6 +42,15 @@ model.llm.save_pretrained(f"{args.model}/MiniCPM")
 tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)
 tok.save_pretrained(f"{args.model}/MiniCPM")
 os.system(f"cp {args.model}/modeling_minicpm.py {args.model}/MiniCPM/modeling_minicpm.py")
+os.system(f"cp {args.model}/tokenizer.json {args.model}/MiniCPM/tokenizer.json")
+
+with open(f"{args.model}/MiniCPM/tokenizer_config.json", "r") as f:
+    d = json.load(f)
+d.pop("auto_map")
+d["tokenizer_class"] = "LlamaTokenizer"
+d.pop("add_prefix_space")
+with open(f"{args.model}/MiniCPM/tokenizer_config.json", "w") as f:
+    json.dump(d, f, indent=2)

 print("Done!")
 print(f"Now you can convert {args.model} to a regular LLaMA GGUF file.")