diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 3318be35c..5eee32016 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1913,6 +1913,11 @@ class MambaModel(Model):
     def write_tensors(self):
         block_count = self.hparams["n_layer"]
         tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
+
+        tok_embd = None
+        tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight"
+        output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight"
+
         for name, data_torch in self.get_tensors():
             old_dtype = data_torch.dtype
@@ -1930,6 +1935,14 @@ class MambaModel(Model):
                 print("A_log --> A ==> " + new_name)
                 data_torch = -torch.exp(data_torch)
 
+            # assuming token_embd.weight is seen before output.weight
+            if tok_embd is not None and new_name == output_name:
+                if torch.equal(tok_embd, data_torch):
+                    print(f"{output_name} is equivalent to {tok_embd_name}, omitting")
+                    continue
+            if new_name == tok_embd_name:
+                tok_embd = data_torch
+
             data = data_torch.squeeze().numpy()
             n_dims = len(data.shape)