convert-hf : allow unusual model part names
For example, loading `model-00001-of-00001.safetensors` now works.

* convert-hf : fix stacking MoE expert tensors

  `torch.stack` and `torch.cat` don't do the same thing.

* convert-hf : fix Mamba conversion

  Tested to work even with a SentencePiece-based tokenizer.
parent 56f60f5d69
commit 3870164f47

1 changed file with 30 additions and 41 deletions
```diff
@@ -49,9 +49,8 @@ class Model(Protocol):
     is_big_endian: bool
     endianess: gguf.GGUFEndian
     use_temp_file: bool
+    part_names: list[str]
     is_safetensors: bool
-    num_parts: int
-    part_names: Iterable[str]
     hparams: dict[str, Any]
     gguf_writer: gguf.GGUFWriter
     block_count: int
@@ -67,9 +66,10 @@ class Model(Protocol):
         self.is_big_endian = is_big_endian
         self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
         self.use_temp_file = use_temp_file
-        self.is_safetensors = self._is_model_safetensors()
-        self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin")
-        self.part_names = self._get_part_names()
+        self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
+        self.is_safetensors = len(self.part_names) > 0
+        if not self.is_safetensors:
+            self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
         self.hparams = Model.load_hparams(self.dir_model)
         self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
         self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
```
```diff
@@ -109,7 +109,7 @@ class Model(Protocol):
             sys.exit()
         if "{bid}" in name:
             assert bid is not None
-            name = name.format(bid)
+            name = name.format(bid=bid)
         return name + suffix

     def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
```
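The `name.format(bid)` to `name.format(bid=bid)` fix matters because `str.format` only fills a named replacement field like `{bid}` from a keyword argument; a positional argument leaves the field unresolved and raises `KeyError`. A standalone illustration (the tensor name is just an example of the GGUF-style `blk.{bid}.…` pattern):

```python
name = "blk.{bid}.attn_norm"

try:
    name.format(0)            # positional arg cannot fill the named field {bid}
except KeyError as err:
    print("KeyError:", err)   # KeyError: 'bid'

print(name.format(bid=0))     # blk.0.attn_norm
```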
```diff
@@ -228,13 +228,13 @@ class Model(Protocol):
         self.gguf_writer.close()

     @staticmethod
-    def count_model_parts(dir_model: Path, prefix: str) -> int:
-        num_parts = 0
+    def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
+        part_names: list[str] = []
         for filename in os.listdir(dir_model):
-            if filename.endswith(prefix):
-                num_parts += 1
+            if filename.endswith(suffix):
+                part_names.append(filename)

-        return num_parts
+        return part_names

     @staticmethod
     def load_hparams(dir_model):
@@ -258,19 +258,6 @@ class Model(Protocol):
         except KeyError:
             raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

-    def _is_model_safetensors(self) -> bool:
-        return Model.count_model_parts(self.dir_model, ".safetensors") > 0
-
-    def _get_part_names(self) -> Iterable[str]:
-        if self.is_safetensors:
-            if self.num_parts == 1: # there's only one .safetensors file
-                return ("model.safetensors",)
-            return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1))
-
-        if self.num_parts == 1: # there's only one .bin file
-            return ("pytorch_model.bin",)
-        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
-
     # used for GPT-2 BPE and WordPiece vocabs
     def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
```
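The removed `_is_model_safetensors()` / `_get_part_names()` pair guessed fixed file names (`model.safetensors`, `model-{n:05}-of-{num_parts:05}.safetensors`, `pytorch_model.bin`, ...), which is why a single shard named `model-00001-of-00001.safetensors` was never found. The new `get_model_part_names()` simply lists whatever files carry the right suffix. A rough standalone sketch of that behaviour (the directory name is hypothetical):

```python
import os
from pathlib import Path

def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
    # Same idea as the static method above: collect every file with the
    # requested suffix instead of generating an expected naming pattern.
    return [name for name in os.listdir(dir_model) if name.endswith(suffix)]

# Hypothetical directory holding one unusually named shard:
#   my-model/model-00001-of-00001.safetensors
# get_model_part_names(Path("my-model"), ".safetensors")
#   -> ["model-00001-of-00001.safetensors"]
# The old logic would have looked for "model.safetensors" and found nothing.
```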
```diff
@@ -446,7 +433,7 @@ class Model(Protocol):
             raise FileNotFoundError(f"File not found: {tokenizer_path}")

         tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(tokenizer_path)
+        tokenizer.LoadFromFile(str(tokenizer_path))

         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

```
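The added `str(...)` conversions (here and in the Phi-3 and InternLM2 hunks below) are presumably needed because `SentencePieceProcessor.LoadFromFile` goes through the native sentencepiece binding, which expects a plain string path and can reject a `pathlib.Path`. A minimal sketch with a hypothetical file name:

```python
from pathlib import Path
from sentencepiece import SentencePieceProcessor

tokenizer_path = Path("tokenizer.model")  # hypothetical location

tokenizer = SentencePieceProcessor()
# Convert the Path to str before passing it to the native loader,
# mirroring the change above.
tokenizer.LoadFromFile(str(tokenizer_path))
```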
```diff
@@ -1120,7 +1107,7 @@ class StableLMModel(Model):
             ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight"
             datas.append(norms[ename])
             del norms[ename]
-        data_torch = torch.cat(datas, dim=0)
+        data_torch = torch.stack(datas, dim=0)

         merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight"
         new_name = self.map_tensor_name(merged_name)
@@ -1204,7 +1191,7 @@ class LlamaModel(Model):
             assert bid is not None

             if self._experts is None:
-                self._experts = [{} for _ in range(n_experts)]
+                self._experts = [{} for _ in range(self.block_count)]

             self._experts[bid][name] = data_torch

```
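`self._experts` is indexed a few lines later as `self._experts[bid][name]`, where `bid` is the block (layer) index, so the list needs one dict per block; sizing it with `n_experts` (the number of experts per layer) raises an `IndexError` as soon as a model has more layers than experts. The same fix appears in the `GrokModel` and `Qwen2MoeModel` hunks below.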
```diff
@@ -1220,7 +1207,7 @@ class LlamaModel(Model):
                         datas.append(self._experts[bid][ename])
                         del self._experts[bid][ename]

-                    data_torch = torch.cat(datas, dim=0)
+                    data_torch = torch.stack(datas, dim=0)

                     merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"

```
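As the commit message says, `torch.stack` and `torch.cat` don't do the same thing: `cat` joins along an existing dimension, while `stack` adds a new leading dimension, which is what the merged 3D expert tensor (and the stacked Q/K norms in the StableLM hunk above) needs. A minimal sketch with made-up shapes:

```python
import torch

# Hypothetical MoE layer: 4 experts, each expert weight of shape (3, 5).
experts = [torch.randn(3, 5) for _ in range(4)]

print(torch.cat(experts, dim=0).shape)    # torch.Size([12, 5]): expert boundary lost
print(torch.stack(experts, dim=0).shape)  # torch.Size([4, 3, 5]): one slice per expert
```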
```diff
@@ -1267,7 +1254,7 @@ class GrokModel(Model):
             assert bid is not None

             if self._experts is None:
-                self._experts = [{} for _ in range(n_experts)]
+                self._experts = [{} for _ in range(self.block_count)]

             self._experts[bid][name] = data_torch

@@ -1283,7 +1270,7 @@ class GrokModel(Model):
                         datas.append(self._experts[bid][ename])
                         del self._experts[bid][ename]

-                    data_torch = torch.cat(datas, dim=0)
+                    data_torch = torch.stack(datas, dim=0)

                     merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"

@@ -1484,7 +1471,7 @@ class Qwen2MoeModel(Model):
             assert bid is not None

             if self._experts is None:
-                self._experts = [{} for _ in range(n_experts)]
+                self._experts = [{} for _ in range(self.block_count)]

             self._experts[bid][name] = data_torch

@@ -1500,7 +1487,7 @@ class Qwen2MoeModel(Model):
                         datas.append(self._experts[bid][ename])
                         del self._experts[bid][ename]

-                    data_torch = torch.cat(datas, dim=0)
+                    data_torch = torch.stack(datas, dim=0)

                     merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"

```
```diff
@@ -1604,7 +1591,7 @@ class Phi3MiniModel(Model):
             sys.exit(1)

         tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(tokenizer_path)
+        tokenizer.LoadFromFile(str(tokenizer_path))

         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

@@ -1786,7 +1773,7 @@ class InternLM2Model(Model):
         add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix

         tokenizer = SentencePieceProcessor()
-        tokenizer.LoadFromFile(tokenizer_path)
+        tokenizer.LoadFromFile(str(tokenizer_path))
         tokenizer.serialized_model_proto

         vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
```
```diff
@@ -2171,13 +2158,15 @@ class MambaModel(Model):
     def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
         del n_dims # unused

-        return new_name in (self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
-            gguf.MODEL_TENSOR.SSM_CONV1D,
-            gguf.MODEL_TENSOR.SSM_X,
-            gguf.MODEL_TENSOR.SSM_DT,
-            gguf.MODEL_TENSOR.SSM_A,
-            gguf.MODEL_TENSOR.SSM_D,
-        ])
+        return bid is not None and new_name in (
+            self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [
+                gguf.MODEL_TENSOR.SSM_CONV1D,
+                gguf.MODEL_TENSOR.SSM_X,
+                gguf.MODEL_TENSOR.SSM_DT,
+                gguf.MODEL_TENSOR.SSM_A,
+                gguf.MODEL_TENSOR.SSM_D,
+            ]
+        )


 @Model.register("CohereForCausalLM")
```
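The added `bid is not None` check short-circuits before the generator expression is evaluated. All the SSM tensor names contain a `{bid}` placeholder, and `format_tensor_name` asserts that `bid` is not `None` for such names (see the `format_tensor_name` hunk near the top of the diff), so this guard is presumably what keeps tensors that have no block index from tripping that assertion.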