From 8504d2d0da8cc7a1f2eee0e9e56949f960510b75 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 8 Feb 2024 09:46:47 +0200 Subject: [PATCH 1/6] tests : .gitignore obj files --- tests/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/.gitignore b/tests/.gitignore index 092dce742..9427cf13d 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1,3 +1,3 @@ * !*.* -test-c.o +*.o From 26d4efd11e48908e14e2ee9471a7fc4c57079a1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 8 Feb 2024 09:46:30 +0100 Subject: [PATCH 2/6] sampling: fix top_k <= 0 (#5388) * sampling: fix top_k <= 0 * Update llama.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- common/sampling.cpp | 2 +- llama.cpp | 4 ++++ tests/test-sampling.cpp | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/common/sampling.cpp b/common/sampling.cpp index e8675a8c0..844ad7c53 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -132,7 +132,7 @@ static void sampler_queue( const float temp = params.temp; const float dynatemp_range = params.dynatemp_range; const float dynatemp_exponent = params.dynatemp_exponent; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + const int32_t top_k = params.top_k; const float top_p = params.top_p; const float min_p = params.min_p; const float tfs_z = params.tfs_z; diff --git a/llama.cpp b/llama.cpp index c45ae1d50..f8f5796a4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8585,6 +8585,10 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can // } const int64_t t_start_sample_us = ggml_time_us(); + + if (k <= 0) { + k = candidates->size; + } k = std::max(k, (int) min_keep); k = std::min(k, (int) candidates->size); diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index c3b3d6629..6374958fe 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -235,6 +235,8 @@ int main(void) { test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1); test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4); + test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f); From a6e514a85f0fda38ff78ec91782877ea3d19ed98 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 8 Feb 2024 09:58:19 +0100 Subject: [PATCH 3/6] llava: fix typo/formatting in README.md (#5405) This commit fixes a typo in the README.md file for the llava example which is causing the formatting to look a little off: Clone llava-v15-7b`` and clip-vit-large-patch14-336`` locally Signed-off-by: Daniel Bevenius --- examples/llava/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 323c5fdd0..295181a34 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -21,7 +21,7 @@ After building, run: `./llava-cli` to see the usage. 
For example: ## Model conversion -- Clone `llava-v15-7b`` and `clip-vit-large-patch14-336`` locally: +- Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally: ```sh git clone https://huggingface.co/liuhaotian/llava-v1.5-7b From 4aa43fab569215a13495a7f1a0f8afc541b16d03 Mon Sep 17 00:00:00 2001 From: runfuture Date: Thu, 8 Feb 2024 18:36:19 +0800 Subject: [PATCH 4/6] llama : fix MiniCPM (#5392) * fix bug for norm_rms_eps missing * to align with the same order as convert.py for model write * fix: undo HF models permute tensor * update for flake8 lint --- convert-hf-to-gguf.py | 63 +++++++++++++++++++++++++++++++++++++++++-- llama.cpp | 2 ++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 829d68368..0d4ea03b4 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1078,17 +1078,76 @@ class MiniCPMModel(Model): self.gguf_writer.add_name("MiniCPM") self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) - self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) def set_vocab(self): self._set_vocab_hf() + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def write_tensors(self): + block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + n_head = self.hparams.get("num_attention_heads") + n_kv_head = self.hparams.get("num_key_value_heads") + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 
as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + class QwenModel(Model): @staticmethod diff --git a/llama.cpp b/llama.cpp index f8f5796a4..552e0d02e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2947,6 +2947,8 @@ static void llm_load_hparams( } break; case LLM_ARCH_MINICPM: { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { case 40: model.type = e_model::MODEL_2B; break; default: model.type = e_model::MODEL_UNKNOWN; From b7b74cef36a93ae01e0b9af8986d131761742d0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 8 Feb 2024 11:36:54 +0100 Subject: [PATCH 5/6] fix trailing whitespace (#5407) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 552e0d02e..89acafbc3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8587,7 +8587,7 @@ void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * can // } const int64_t t_start_sample_us = ggml_time_us(); - + if (k <= 0) { k = candidates->size; } From ff4ff05c5ff4311c05a8ce1f984c7d8def4f07a5 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Thu, 8 Feb 2024 15:20:03 +0100 Subject: [PATCH 6/6] llava : add missing .py, and fix paths in README.md (#5414) This commit adds the missing .py extension to the convert-image-encoder-to-gguf script. It also fixes the paths for the `model` and `mmproj` options in the example llava-cli command. Signed-off-by: Daniel Bevenius --- examples/llava/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/llava/README.md b/examples/llava/README.md index 295181a34..721d5e613 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -14,7 +14,7 @@ Build with cmake or run `make llava-cli` to build it. After building, run: `./llava-cli` to see the usage. For example: ```sh -./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg +./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg ``` **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. @@ -38,7 +38,7 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b 3. Use `convert-image-encoder-to-gguf.py` to convert the LLaVA image encoder to GGUF: ```sh -python ./examples/llava/convert-image-encoder-to-gguf -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b +python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b ``` 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
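
Patch 2 above moves the handling of `top_k <= 0` out of `sampler_queue` in common/sampling.cpp and into `llama_sample_top_k` in llama.cpp, so a non-positive `top_k` is now interpreted as "consider the full candidate list" and then clamped against `min_keep` and the candidate count. The following standalone sketch is not part of the patches; `effective_top_k` is a hypothetical helper that simply mirrors the clamping logic the patch adds, and the expected outputs match the new `test-sampling.cpp` cases (four candidates, `k = 0` keeps all of them):

```cpp
// Standalone sketch of the top_k clamping behaviour introduced in patch 2.
// effective_top_k is a hypothetical helper; llama.cpp performs this inline
// inside llama_sample_top_k.
#include <algorithm>
#include <cstdio>
#include <vector>

static int effective_top_k(int k, size_t n_candidates, size_t min_keep) {
    if (k <= 0) {
        // top_k <= 0 now means "consider every candidate"
        k = (int) n_candidates;
    }
    k = std::max(k, (int) min_keep);      // never drop below min_keep tokens
    k = std::min(k, (int) n_candidates);  // never exceed the candidate count
    return k;
}

int main() {
    std::vector<float> probs = {0.1f, 0.2f, 0.3f, 0.4f};
    // Mirrors the new test-sampling.cpp cases: k = 0 keeps all four candidates.
    printf("k=0  -> keep %d\n", effective_top_k(0,  probs.size(), 1)); // 4
    printf("k=3  -> keep %d\n", effective_top_k(3,  probs.size(), 1)); // 3
    printf("k=99 -> keep %d\n", effective_top_k(99, probs.size(), 1)); // 4 (clamped)
    return 0;
}
```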