From 8504d2d0da8cc7a1f2eee0e9e56949f960510b75 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 8 Feb 2024 09:46:47 +0200 Subject: [PATCH 1/6] tests : .gitignore obj files --- tests/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/.gitignore b/tests/.gitignore index 092dce742..9427cf13d 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,3 +1,3 @@ * !*.* -test-c.o +*.o From 26d4efd11e48908e14e2ee9471a7fc4c57079a1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 8 Feb 2024 09:46:30 +0100 Subject: [PATCH 2/6] sampling: fix top_k <= 0 (#5388) * sampling: fix top_k <= 0 * Update llama.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- common/sampling.cpp | 2 +- llama.cpp | 4 ++++ tests/test-sampling.cpp | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index e8675a8c0..844ad7c53 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -132,7 +132,7 @@ static void sampler_queue( const float temp = params.temp; const float dynatemp_range = params.dynatemp_range; const float dynatemp_exponent = params.dynatemp_exponent; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + const int32_t top_k = params.top_k; const float top_p = params.top_p; const float min_p = params.min_p; const float tfs_z = params.tfs_z; diff --git a/llama.cpp b/llama.cpp index c45ae1d50..f8f5796a4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8585,6 +8585,10 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can // } const int64_t t_start_sample_us = ggml_time_us(); + + if (k <= 0) { + k = candidates->size; + } k = std::max(k, (int) min_keep); k = std::min(k, (int) candidates->size); diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index c3b3d6629..6374958fe 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -235,6 +235,8 @@ int main(void) { test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f); From a6e514a85f0fda38ff78ec91782877ea3d19ed98 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 8 Feb 2024 09:58:19 +0100 Subject: [PATCH 3/6] llava: fix typo/formatting in README.md (#5405) This commit fixes a typo in the README.md file for the llava example which is causing the formatting to look a little off: Clone llava-v15-7b`` and clip-vit-large-patch14-336`` locally Signed-off-by: Daniel Bevenius --- examples/llava/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 323c5fdd0..295181a34 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -21,7 +21,7 @@ After building, run: `./llava-cli` to see the usage. 
For example: ## Model conversion -- Clone `llava-v15-7b`` and `clip-vit-large-patch14-336`` locally: +- Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally: ```sh git clone https://huggingface.co/liuhaotian/llava-v1.5-7b From 4aa43fab569215a13495a7f1a0f8afc541b16d03 Mon Sep 17 00:00:00 2001 From: runfuture Date: Thu, 8 Feb 2024 18:36:19 +0800 Subject: [PATCH 4/6] llama : fix MiniCPM (#5392) * fix bug for norm_rms_eps missing * to align with the same order as convert.py for model write * fix: undo HF models permute tensor * update for flake8 lint --- convert-hf-to-gguf.py | 63 +++++++++++++++++++++++++++++++++++++++++-- llama.cpp | 2 ++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 829d68368..0d4ea03b4 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1078,17 +1078,76 @@ class MiniCPMModel(Model): self.gguf_writer.add_name("MiniCPM") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) def set_vocab(self): self._set_vocab_hf() + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + n_head = self.hparams.get("num_attention_heads") + n_kv_head = self.hparams.get("num_key_value_heads") + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 
as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + class QwenModel(Model): @staticmethod diff --git a/llama.cpp b/llama.cpp index f8f5796a4..552e0d02e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2947,6 +2947,8 @@ static void llm_load_hparams( } break; case LLM_ARCH_MINICPM: { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { case 40: model.type = e_model::MODEL_2B; break; default: model.type = e_model::MODEL_UNKNOWN; From b7b74cef36a93ae01e0b9af8986d131761742d0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 8 Feb 2024 11:36:54 +0100 Subject: [PATCH 5/6] fix trailing whitespace (#5407) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 552e0d02e..89acafbc3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8587,7 +8587,7 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can // } const int64_t t_start_sample_us = ggml_time_us(); - + if (k <= 0) { k = candidates->size; } From ff4ff05c5ff4311c05a8ce1f984c7d8def4f07a5 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 8 Feb 2024 15:20:03 +0100 Subject: [PATCH 6/6] llava : add missing .py, and fix paths in README.md (#5414) This commit adds the missing .py extension to the convert-image-encoder-to-gguf script. It also fixes the paths for the `model` and `mmproj` options in the example llava-cli command. Signed-off-by: Daniel Bevenius --- examples/llava/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 295181a34..721d5e613 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -14,7 +14,7 @@ Build with cmake or run `make llava-cli` to build it. After building, run: `./llava-cli` to see the usage. For example: ```sh -./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg +./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg ``` **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. @@ -38,7 +38,7 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b 3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF: ```sh -python ./examples/llava/convert-image-encoder-to-gguf -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b +python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b ``` 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
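
Patch 2 above moves the handling of `top_k <= 0` out of `sampler_queue` in common/sampling.cpp and into `llama_sample_top_k` in llama.cpp, so a non-positive `top_k` is now interpreted as "consider the full candidate list" and then clamped against `min_keep` and the candidate count. The following standalone sketch is not part of the patches; `effective_top_k` is a hypothetical helper that simply mirrors the clamping logic the patch adds, and the expected outputs match the new `test-sampling.cpp` cases (four candidates, `k = 0` keeps all of them):

```cpp
// Standalone sketch of the top_k clamping behaviour introduced in patch 2.
// effective_top_k is a hypothetical helper; llama.cpp performs this inline
// inside llama_sample_top_k.
#include <algorithm>
#include <cstdio>
#include <vector>

static int effective_top_k(int k, size_t n_candidates, size_t min_keep) {
    if (k <= 0) {
        // top_k <= 0 now means "consider every candidate"
        k = (int) n_candidates;
    }
    k = std::max(k, (int) min_keep);      // never drop below min_keep tokens
    k = std::min(k, (int) n_candidates);  // never exceed the candidate count
    return k;
}

int main() {
    std::vector<float> probs = {0.1f, 0.2f, 0.3f, 0.4f};
    // Mirrors the new test-sampling.cpp cases: k = 0 keeps all four candidates.
    printf("k=0  -> keep %d\n", effective_top_k(0,  probs.size(), 1)); // 4
    printf("k=3  -> keep %d\n", effective_top_k(3,  probs.size(), 1)); // 3
    printf("k=99 -> keep %d\n", effective_top_k(99, probs.size(), 1)); // 4 (clamped)
    return 0;
}
```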