convert-hf : omit output.weight when identical to token_embd.weight
Only for Mamba for now, but it might be relevant to other models eventually. Most Mamba models actually share these two tensors, albeit implicitly.
This commit is contained in:
parent
1c8ea55843
commit
d0d32dced9
1 changed file with 13 additions and 0 deletions
|
@ -1913,6 +1913,11 @@ class MambaModel(Model):
|
||||||
def write_tensors(self):
|
def write_tensors(self):
|
||||||
block_count = self.hparams["n_layer"]
|
block_count = self.hparams["n_layer"]
|
||||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||||
|
|
||||||
|
tok_embd = None
|
||||||
|
tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight"
|
||||||
|
output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
|
||||||
|
|
||||||
for name, data_torch in self.get_tensors():
|
for name, data_torch in self.get_tensors():
|
||||||
old_dtype = data_torch.dtype
|
old_dtype = data_torch.dtype
|
||||||
|
|
||||||
|
@ -1930,6 +1935,14 @@ class MambaModel(Model):
|
||||||
print("A_log --> A ==> " + new_name)
|
print("A_log --> A ==> " + new_name)
|
||||||
data_torch = -torch.exp(data_torch)
|
data_torch = -torch.exp(data_torch)
|
||||||
|
|
||||||
|
# assuming token_embd.weight is seen before output.weight
|
||||||
|
if tok_embd is not None and new_name == output_name:
|
||||||
|
if torch.equal(tok_embd, data_torch):
|
||||||
|
print(f"{output_name} is equivalent to {tok_embd_name}, omitting")
|
||||||
|
continue
|
||||||
|
if new_name == tok_embd_name:
|
||||||
|
tok_embd = data_torch
|
||||||
|
|
||||||
data = data_torch.squeeze().numpy()
|
data = data_torch.squeeze().numpy()
|
||||||
|
|
||||||
n_dims = len(data.shape)
|
n_dims = len(data.shape)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue