From 0e0590adab9f367b15ae2bf090a6d24f9df47ff1 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 2 Jul 2024 08:39:38 +0200 Subject: [PATCH 01/12] cuda : update supports_op for matrix multiplication (#8245) --- ggml/src/ggml-cuda.cu | 47 ++++++++++++++++++++++++-------------- tests/test-backend-ops.cpp | 1 + 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu index 649ef5a08..1c9ccc8a1 100644 --- a/ggml/src/ggml-cuda.cu +++ b/ggml/src/ggml-cuda.cu @@ -2711,27 +2711,40 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: { - struct ggml_tensor * a; - struct ggml_tensor * b; + struct ggml_tensor * a = op->src[0]; if (op->op == GGML_OP_MUL_MAT) { - a = op->src[0]; - b = op->src[1]; - } else { - a = op->src[2]; - b = op->src[1]; - } - if (a->ne[3] != b->ne[3]) { - return false; - } - ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || - a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || - a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { - if (b->ne[1] == 1 && ggml_nrows(b) > 1) { + struct ggml_tensor * b = op->src[1]; + if (a->ne[3] != b->ne[3]) { return false; } } - return true; + switch (a->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + return true; + default: + return false; + } } break; case GGML_OP_GET_ROWS: { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f74c0db47..2bb71ac03 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2052,6 +2052,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, + GGML_TYPE_BF16, }; // unary ops From 023b8807e10bc3ade24a255f01c1ad2a01bb4228 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Tue, 2 Jul 2024 08:40:49 +0200 Subject: [PATCH 02/12] convert-hf : print output file name when completed (#8181) * convert-hf : print output file name when completed This commit adds the output file name to the log message when the conversion is completed. The motivation for this change is that when `--outfile` option is not specified it migth not be obvious where the output file is written. With this change the output of running the script will be something like the following: ```console INFO:hf-to-gguf:Model successfully exported to models/gemma-2-9b-it.gguf. ``` Signed-off-by: Daniel Bevenius * squash! convert-hf : print output file name when completed Updates the output of to support printing the directory if the output is split into multiple files. Also the output file name is now retrieved from the model_instance object. Signed-off-by: Daniel Bevenius * squash! convert-hf : print output file name when completed Use parent attribute of Path object and string interpolation. 
Signed-off-by: Daniel Bevenius * squash! convert-hf : print output file name when completed Use os.sep instead of hardcoding the path separator. Signed-off-by: Daniel Bevenius --------- Signed-off-by: Daniel Bevenius --- convert-hf-to-gguf.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 6833e9437..05fd70171 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -3120,7 +3120,8 @@ def main() -> None: "auto": gguf.LlamaFileType.GUESSED, } - if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"): + is_split = args.split_max_tensors > 0 or args.split_max_size != "0" + if args.use_temp_file and is_split: logger.error("Error: Cannot use temp file when splitting") sys.exit(1) @@ -3157,11 +3158,12 @@ def main() -> None: if args.vocab_only: logger.info("Exporting model vocab...") model_instance.write_vocab() - logger.info("Model vocab successfully exported.") + logger.info(f"Model vocab successfully exported to {model_instance.fname_out}") else: logger.info("Exporting model...") model_instance.write() - logger.info("Model successfully exported.") + out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out + logger.info(f"Model successfully exported to {out_path}") if __name__ == '__main__': From 968967376dc2c018d29f897c4883d335bbf384fb Mon Sep 17 00:00:00 2001 From: Faisal Zaghloul Date: Tue, 2 Jul 2024 10:36:00 -0400 Subject: [PATCH 03/12] Add `JAIS` model(s) (#8118) * Add `JAIS` model(s) * cleanup * address review comments * remove hack * un-hardcode max-alibi-bias * minor tweaks --------- Co-authored-by: fmz --- convert-hf-to-gguf-update.py | 1 + convert-hf-to-gguf.py | 93 ++++++++++++++++++ gguf-py/gguf/constants.py | 14 +++ gguf-py/gguf/tensor_mapping.py | 19 ++-- include/llama.h | 1 + src/llama.cpp | 169 +++++++++++++++++++++++++++++++++ 6 files changed, 288 insertions(+), 9 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 2758214fa..944e9d15a 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -86,6 +86,7 @@ models = [ {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", }, {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B + {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", }, ] diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 05fd70171..6add27cbb 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -490,6 +490,9 @@ class Model: if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee": # ref: https://huggingface.co/LumiOpen/Viking-7B res = "viking" + if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901": + # ref: https://huggingface.co/core42/jais-13b + res = "jais" if res is None: logger.warning("\n") @@ -2965,6 +2968,96 @@ class T5Model(Model): return [(self.map_tensor_name(name), data_torch)] +@Model.register("JAISLMHeadModel") +class JaisModel(Model): + model_arch = gguf.MODEL_ARCH.JAIS + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # SwigLU activation + assert self.hparams["activation_function"] == "swiglu" + # ALiBi position embedding + 
assert self.hparams["position_embedding_type"] == "alibi" + + # Embeddings scale + self.embeddings_scale = 1.0 + # note: For some JAIS flavors, output is tied to (same as) wte in original model + self.output_is_wte = False + if 'mup_embeddings_scale' in self.hparams: + self.output_is_wte = True # Hack (?) + self.embeddings_scale = self.hparams['mup_embeddings_scale'] + elif 'embeddings_scale' in self.hparams: + self.embeddings_scale = self.hparams['embeddings_scale'] + else: + assert False + + self.width_scale = 1.0 + if 'mup_output_alpha' in self.hparams: + assert 'mup_width_scale' in self.hparams + self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale'] + elif 'width_scale' in self.hparams: + self.width_scale = self.hparams['width_scale'] + else: + assert False + + self.max_alibi_bias = 8.0 + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_context_length(self.hparams["n_positions"]) + self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"]) + self.gguf_writer.add_head_count(self.hparams["n_head"]) + self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) + self.gguf_writer.add_file_type(self.ftype) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + tensors: list[tuple[str, Tensor]] = [] + + # we don't need these + if name.endswith((".attn.bias")): + return tensors + + if name.endswith(("relative_pe.slopes")): + # Calculate max ALiBi bias (this is the inverse of the ALiBi calculation) + # Some other models has max_alibi_bias spelled out explicitly in the hyperparams, + # but Jais's PyTorch model simply precalculates the slope values and places them + # in relative_pes.slopes + n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"])) + first_val = float(data_torch._data[0]) + self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2) + + return tensors + + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")): + data_torch = data_torch.transpose(1, 0) + + new_name = self.map_tensor_name(name) + + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + tensors.append((new_name, data_torch * self.embeddings_scale)) + if self.output_is_wte: + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale)) + elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): + assert not self.output_is_wte + tensors.append((new_name, data_torch * self.width_scale)) + else: + tensors.append((new_name, data_torch)) + + return tensors + + def write_tensors(self): + super().write_tensors() + self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) + + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e87c58266..419f10cee 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -164,6 +164,7 @@ class MODEL_ARCH(IntEnum): DEEPSEEK2 = auto() BITNET = auto() T5 = auto() + JAIS = auto() class MODEL_TENSOR(IntEnum): @@ -288,6 +289,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", + MODEL_ARCH.JAIS: "jais", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -954,6 +956,18 @@ 
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ENC_FFN_UP, MODEL_TENSOR.ENC_OUTPUT_NORM, ], + MODEL_ARCH.JAIS: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0bed43939..20e28423b 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -10,7 +10,7 @@ class TensorNameMap: # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( "gpt_neox.embed_in", # gptneox - "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx + "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx jais "transformer.word_embeddings", # falcon "word_embeddings", # bloom "model.embed_tokens", # llama-hf @@ -49,7 +49,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -58,7 +58,7 @@ class TensorNameMap: # Output norm MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox - "transformer.ln_f", # gpt2 gpt-j falcon + "transformer.ln_f", # gpt2 gpt-j falcon jais "model.norm", # llama-hf baichuan internlm2 "norm", # llama-pth "transformer.norm_f", # mpt dbrx @@ -81,7 +81,7 @@ class TensorNameMap: # Attention norm MODEL_TENSOR.ATTN_NORM: ( "gpt_neox.layers.{bid}.input_layernorm", # gptneox - "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen jais "transformer.blocks.{bid}.norm_1", # mpt "transformer.h.{bid}.input_layernorm", # falcon7b "h.{bid}.input_layernorm", # bloom @@ -109,7 +109,7 @@ class TensorNameMap: # Attention query-key-value MODEL_TENSOR.ATTN_QKV: ( "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox - "transformer.h.{bid}.attn.c_attn", # gpt2 qwen + "transformer.h.{bid}.attn.c_attn", # gpt2 qwen jais "transformer.blocks.{bid}.attn.Wqkv", # mpt "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx "transformer.h.{bid}.self_attention.query_key_value", # falcon @@ -160,7 +160,7 @@ class TensorNameMap: # Attention output MODEL_TENSOR.ATTN_OUT: ( "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen jais "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom @@ -202,7 +202,7 @@ class TensorNameMap: # Feed-forward norm MODEL_TENSOR.FFN_NORM: ( "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox - "transformer.h.{bid}.ln_2", # gpt2 refact qwen + "transformer.h.{bid}.ln_2", # gpt2 refact qwen jais "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt "model.layers.{bid}.post_attention_layernorm", # llama-hf @@ -239,7 +239,7 @@ class TensorNameMap: # Feed-forward up MODEL_TENSOR.FFN_UP: ( "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox - "transformer.h.{bid}.mlp.c_fc", # gpt2 + "transformer.h.{bid}.mlp.c_fc", # gpt2 jais "transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon "h.{bid}.mlp.dense_h_to_4h", # bloom @@ -285,6 +285,7 @@ class TensorNameMap: 
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact "layers.{bid}.feed_forward.w1", # llama-pth "transformer.h.{bid}.mlp.w2", # qwen + "transformer.h.{bid}.mlp.c_fc2", # jais "model.layers.layers.{bid}.mlp.gate_proj", # plamo "model.layers.{bid}.feed_forward.w1", # internlm2 "encoder.layers.{bid}.mlp.fc12", # nomic-bert @@ -308,7 +309,7 @@ class TensorNameMap: # Feed-forward down MODEL_TENSOR.FFN_DOWN: ( "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox - "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen + "transformer.h.{bid}.mlp.c_proj", # gpt2 refact qwen jais "transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon "h.{bid}.mlp.dense_4h_to_h", # bloom diff --git a/include/llama.h b/include/llama.h index cafeafb85..c5b618292 100644 --- a/include/llama.h +++ b/include/llama.h @@ -89,6 +89,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, LLAMA_VOCAB_PRE_TYPE_PORO = 15, LLAMA_VOCAB_PRE_TYPE_VIKING = 16, + LLAMA_VOCAB_PRE_TYPE_JAIS = 17, }; // note: these values should be synchronized with ggml_rope diff --git a/src/llama.cpp b/src/llama.cpp index eea532f6a..73f52435a 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -228,6 +228,7 @@ enum llm_arch { LLM_ARCH_DEEPSEEK2, LLM_ARCH_BITNET, LLM_ARCH_T5, + LLM_ARCH_JAIS, LLM_ARCH_UNKNOWN, }; @@ -269,6 +270,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, + { LLM_ARCH_JAIS, "jais" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -1236,6 +1238,21 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_JAIS, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2035,6 +2052,7 @@ enum e_model { MODEL_410M, MODEL_0_5B, MODEL_1B, + MODEL_1_3B, MODEL_1_4B, MODEL_2B, MODEL_2_8B, @@ -4276,6 +4294,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_410M: return "410M"; case MODEL_0_5B: return "0.5B"; case MODEL_1B: return "1B"; + case MODEL_1_3B: return "1.3B"; case MODEL_1_4B: return "1.4B"; case MODEL_2B: return "2B"; case MODEL_2_8B: return "2.8B"; @@ -4898,6 +4917,18 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_JAIS: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); + + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1_3B; break; + case 40: model.type = e_model::MODEL_13B; break; + /* TODO: add variants */ + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -5129,6 +5160,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "viking") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_VIKING; + } else if ( + tokenizer_pre == "jais") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -6962,6 +6996,44 @@ static bool llm_load_tensors( layer.ffn_up_scale = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "scale", i), {1}); } } 
break; + case LLM_ARCH_JAIS: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // Output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_gate_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -12354,6 +12426,97 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_jais() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*cur->nb[0]*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*cur->nb[0]*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, 
"Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/float(n_embd_head), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -12585,6 +12748,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_bitnet(); } break; + case LLM_ARCH_JAIS: + { + result = llm.build_jais(); + } break; default: GGML_ASSERT(false); } @@ -13947,6 +14114,7 @@ struct llm_tokenizer_bpe { break; case LLAMA_VOCAB_PRE_TYPE_GPT2: case LLAMA_VOCAB_PRE_TYPE_OLMO: + case LLAMA_VOCAB_PRE_TYPE_JAIS: regex_exprs = { "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", }; @@ -17826,6 +17994,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MAMBA: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_T5: + case LLM_ARCH_JAIS: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values From 07a3fc0608a68c0c93a5fbfa9c58f4c9ec64cb81 Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Tue, 2 Jul 2024 12:18:10 -0400 Subject: [PATCH 04/12] Removes multiple newlines at the end of files that is breaking the editorconfig step of CI. 
(#8258) --- .github/ISSUE_TEMPLATE/config.yml | 2 -- common/common.h | 1 - examples/embedding/README.md | 1 - examples/infill/infill.cpp | 1 - examples/lookup/README.md | 1 - examples/main-cmake-pkg/.gitignore | 1 - examples/main-cmake-pkg/CMakeLists.txt | 1 - examples/server-embd.py | 1 - examples/server/tests/features/passkey.feature | 1 - examples/server/themes/buttons-top/index.html | 1 - examples/server/themes/wild/index.html | 1 - examples/sycl/run-llama2.sh | 1 - examples/sycl/win-build-sycl.bat | 1 - examples/sycl/win-run-llama2.bat | 2 -- ggml/include/ggml-metal.h | 1 - ggml/src/ggml-cuda/cpy.cu | 1 - ggml/src/ggml-metal.metal | 1 - ggml/src/ggml-quants.h | 1 - ggml/src/ggml-vulkan-shaders.hpp | 1 - scripts/pod-llama.sh | 1 - src/unicode-data.cpp | 1 - tests/test-rope.cpp | 1 - 22 files changed, 24 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index c88134dbb..eb8c4b472 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -9,5 +9,3 @@ contact_links: - name: Want to contribute? url: https://github.com/ggerganov/llama.cpp/wiki/contribute about: Head to the contribution guide page of the wiki for areas you can help with - - diff --git a/common/common.h b/common/common.h index 627b7ed85..65c0ef81a 100644 --- a/common/common.h +++ b/common/common.h @@ -459,4 +459,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha void yaml_dump_non_result_info( FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc); - diff --git a/examples/embedding/README.md b/examples/embedding/README.md index 86df18958..e3705b454 100644 --- a/examples/embedding/README.md +++ b/examples/embedding/README.md @@ -58,4 +58,3 @@ The above command will output space-separated float values. 
```powershell embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null ``` - diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index ca71dd687..0e682154d 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -659,4 +659,3 @@ int main(int argc, char ** argv) { return 0; } - diff --git a/examples/lookup/README.md b/examples/lookup/README.md index 5bfb0de93..71c345c03 100644 --- a/examples/lookup/README.md +++ b/examples/lookup/README.md @@ -10,4 +10,3 @@ More info: https://github.com/ggerganov/llama.cpp/pull/4484 https://github.com/ggerganov/llama.cpp/issues/4226 - diff --git a/examples/main-cmake-pkg/.gitignore b/examples/main-cmake-pkg/.gitignore index e32c11c7f..67c01d64c 100644 --- a/examples/main-cmake-pkg/.gitignore +++ b/examples/main-cmake-pkg/.gitignore @@ -48,4 +48,3 @@ build*/ out/ tmp/ - diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index a97ded365..3b38db292 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -30,4 +30,3 @@ target_include_directories(${TARGET} PRIVATE ${_common_path}) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) - diff --git a/examples/server-embd.py b/examples/server-embd.py index 118e04271..a9a36a44c 100644 --- a/examples/server-embd.py +++ b/examples/server-embd.py @@ -31,4 +31,3 @@ for i in range(n-1): embedding2 = np.array(result[j]) similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)) print(f"Similarity between {i} and {j}: {similarity:.2f}") - diff --git a/examples/server/tests/features/passkey.feature b/examples/server/tests/features/passkey.feature index 1bde7aab8..6a5a84e6a 100644 --- a/examples/server/tests/features/passkey.feature +++ b/examples/server/tests/features/passkey.feature @@ -52,4 +52,3 @@ Feature: Passkey / Self-extend with context shift #| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 | #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0 # 987 | - diff --git a/examples/server/themes/buttons-top/index.html b/examples/server/themes/buttons-top/index.html index 6af30d307..8334bcde5 100644 --- a/examples/server/themes/buttons-top/index.html +++ b/examples/server/themes/buttons-top/index.html @@ -1054,4 +1054,3 @@ - diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html index 772e716cd..8361c5774 100644 --- a/examples/server/themes/wild/index.html +++ b/examples/server/themes/wild/index.html @@ -1058,4 +1058,3 @@ - diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index da0e4aaba..111366fb0 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -34,4 +34,3 @@ fi #use multiple GPUs with same max compute units #ZES_ENABLE_SYSMAN=1 ./build/bin/llama-cli -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 - diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index cdae5a528..17dd1ff5c 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -31,4 +31,3 @@ exit /B 0 :ERROR echo comomand error: 
%errorlevel% exit /B %errorlevel% - diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat index 1d4d7d2cd..f0385cdf0 100644 --- a/examples/sycl/win-run-llama2.bat +++ b/examples/sycl/win-run-llama2.bat @@ -7,5 +7,3 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" .\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0 - - diff --git a/ggml/include/ggml-metal.h b/ggml/include/ggml-metal.h index e7543ae79..6c3226c37 100644 --- a/ggml/include/ggml-metal.h +++ b/ggml/include/ggml-metal.h @@ -63,4 +63,3 @@ GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend); #ifdef __cplusplus } #endif - diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 12d741f01..3db57034b 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -487,4 +487,3 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { GGML_ASSERT(false); } } - diff --git a/ggml/src/ggml-metal.metal b/ggml/src/ggml-metal.metal index e2796fd60..c3503479b 100644 --- a/ggml/src/ggml-metal.metal +++ b/ggml/src/ggml-metal.metal @@ -6537,4 +6537,3 @@ template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; - diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index 4d436a8f0..30983b872 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -130,4 +130,3 @@ void iq3xs_free_impl(int grid_size); #ifdef __cplusplus } #endif - diff --git a/ggml/src/ggml-vulkan-shaders.hpp b/ggml/src/ggml-vulkan-shaders.hpp index 01ff66f71..f0c4c6baf 100644 --- a/ggml/src/ggml-vulkan-shaders.hpp +++ b/ggml/src/ggml-vulkan-shaders.hpp @@ -144954,4 +144954,3 @@ unsigned char sum_rows_f32_data[] = { }; const uint64_t sum_rows_f32_len = 2112; - diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 586d6ea18..0d6d4032d 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -210,4 +210,3 @@ fi # more benches #GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 #GGML_CUDA=1 make -j && ./llama-batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 - diff --git a/src/unicode-data.cpp b/src/unicode-data.cpp index 4a939898b..02bdf7823 100644 --- a/src/unicode-data.cpp +++ b/src/unicode-data.cpp @@ -7030,4 +7030,3 @@ const std::vector unicode_ranges_nfd = { // start, last, nfd {0x02FA1C, 0x02FA1C, 0x009F3B}, {0x02FA1D, 0x02FA1D, 0x02A600}, }; - diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index f0895ffaa..8159e276a 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -218,4 +218,3 @@ int main(int /*argc*/, const char ** /*argv*/) { return 0; } - From 3e2618bc7bf9e9fbf58c32cc3c8dd7d5df1de27e Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Tue, 2 Jul 2024 13:19:56 -0400 Subject: [PATCH 05/12] Adding step to `clean` target to remove legacy binary names to reduce upgrade / migration confusion arising from #7809. 
(#8257) --- Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile b/Makefile index 8ae4f1dc4..2730d8b60 100644 --- a/Makefile +++ b/Makefile @@ -62,6 +62,11 @@ TEST_TARGETS = \ tests/test-tokenizer-1-bpe \ tests/test-tokenizer-1-spm +# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned +LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ + simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ + retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm + # Deprecation aliases ifdef LLAMA_CUBLAS $(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.) @@ -1086,6 +1091,7 @@ clean: rm -vrf ggml/src/ggml-cuda/template-instances/*.o rm -rvf $(BUILD_TARGETS) rm -rvf $(TEST_TARGETS) + rm -rvf $(LEGACY_TARGETS) find examples pocs -type f -name "*.o" -delete # From a27152b602b369e76f85b7cb7b872a321b7218f7 Mon Sep 17 00:00:00 2001 From: MistApproach <98988043+MistApproach@users.noreply.github.com> Date: Tue, 2 Jul 2024 22:56:46 +0200 Subject: [PATCH 06/12] fix: add missing short command line argument -mli for multiline-input (#8261) --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 5a0d0ee03..2c05a4d4a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -757,7 +757,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.cache_type_v = argv[++i]; return true; } - if (arg == "--multiline-input") { + if (arg == "-mli" || arg == "--multiline-input") { params.multiline_input = true; return true; } From fadde6713506d9e6c124f5680ab8c7abebe31837 Mon Sep 17 00:00:00 2001 From: AidanBeltonS Date: Wed, 3 Jul 2024 02:55:34 +0100 Subject: [PATCH 07/12] Dequant improvements rebase (#8255) * Single load for half2 * Store scales in local mem * Vec load quantized values --- ggml/src/ggml-sycl/common.hpp | 6 ++++++ ggml/src/ggml-sycl/convert.cpp | 7 +++++-- ggml/src/ggml-sycl/dequantize.hpp | 30 +++++++++++++++++++----------- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index dfd4a7c2c..476d847ca 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -351,4 +351,10 @@ static __dpct_inline__ float warp_reduce_max(float x, return x; } +// Helper for vec loading aligned data +template +inline sycl::vec vec_aligned_load(const Tp* aligned_ptr) { + return *reinterpret_cast*>(aligned_ptr); +} + #endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index ce9de2b42..a15271b51 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -152,12 +152,15 @@ static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k, dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor scale_local_acc(sycl::range<1>(12), cgh); + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { - dequantize_block_q4_K(vx, y, item_ct1); + dequantize_block_q4_K(vx, y, scale_local_acc.get_pointer(), item_ct1); 
}); + }); } } diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp index b6080d83a..ed8ad098b 100644 --- a/ggml/src/ggml-sycl/dequantize.hpp +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -293,7 +293,8 @@ static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restri #if QK_K == 256 static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { if (j < 4) { - d = q[j] & 63; m = q[j + 4] & 63; + d = q[j] & 63; + m = q[j + 4] & 63; } else { d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); @@ -303,7 +304,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8 template static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1) { + uint8_t* scales_local, const sycl::nd_item<3> &item_ct1) { const block_q4_K * x = (const block_q4_K *) vx; const int i = item_ct1.get_group(2); @@ -318,19 +319,26 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri dst_t * y = yy + i*QK_K + 64*il + n*ir; - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; + const sycl::half2 dm = x[i].dm; + const float dall = dm[0]; + const float dmin = dm[1]; - const uint8_t * q = x[i].qs + 32*il + n*ir; + if (tid < 12) + scales_local[tid] = x[i].scales[tid]; + item_ct1.barrier(sycl::access::fence_space::local_space); uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, sc, m); - const float d1 = dall * sc; const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, sc, m); - const float d2 = dall * sc; const float m2 = dmin * m; + get_scale_min_k4(is + 0, scales_local, sc, m); + const float d1 = dall * sc; + const float m1 = dmin * m; + get_scale_min_k4(is + 1, scales_local, sc, m); + const float d2 = dall * sc; + const float m2 = dmin * m; + + sycl::vec q_vec = vec_aligned_load(x[i].qs + 32*il + n*ir); for (int l = 0; l < n; ++l) { - y[l + 0] = d1 * (q[l] & 0xF) - m1; - y[l +32] = d2 * (q[l] >> 4) - m2; + y[l + 0] = d1 * (q_vec[l] & 0xF) - m1; + y[l +32] = d2 * (q_vec[l] >> 4) - m2; } #else const int tid = item_ct1.get_local_id(2); From f8d6a23804f3798ff2869da68c1223b618df09ec Mon Sep 17 00:00:00 2001 From: Judd Date: Wed, 3 Jul 2024 20:40:16 +0800 Subject: [PATCH 08/12] fix typo (#8267) Co-authored-by: Judd --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f5502afbe..bc91ac3a7 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -5312,7 +5312,7 @@ void ggml_mul_mat_set_prec( as -> [cols, rows, n_expert] ids -> [n_experts_used, n_tokens] (i32) b -> [cols, n_expert_used, n_tokens] - c -> [cols, n_expert_used, n_tokens] + c -> [rows, n_expert_used, n_tokens] in b, n_experts_used can be broadcasted to match the n_expert_used of ids From 916248af1f3c16abd7408de848e025da095c621c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 3 Jul 2024 16:01:54 +0200 Subject: [PATCH 09/12] fix phi 3 conversion (#8262) --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 6add27cbb..d01aed224 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1942,7 +1942,7 @@ class Phi3MiniModel(Model): if len(rope_scaling_type) == 0: raise KeyError('Missing the required key rope_scaling.type') - if rope_scaling_type == 'su': + if rope_scaling_type == 'su' or rope_scaling_type == 'longrope': attn_factor = math.sqrt(1 + 
math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0 elif rope_scaling_type == 'yarn': attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0 From 5f2d4e60e202aabee10051e6615bb821e51787be Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 3 Jul 2024 19:33:31 +0200 Subject: [PATCH 10/12] ppl : fix n_seq_max for perplexity (#8277) * ppl : fix n_seq_max for perplexity * use 1 seq for kl_divergence --- examples/perplexity/perplexity.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index efde8dfdf..dbe445391 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1991,6 +1991,12 @@ int main(int argc, char ** argv) { params.n_batch = std::min(params.n_batch, n_kv); } else { params.n_batch = std::min(params.n_batch, params.n_ctx); + if (params.kl_divergence) { + params.n_parallel = 1; + } else { + // ensure there's at least enough seq_ids for HellaSwag + params.n_parallel = std::max(4, params.n_parallel); + } } if (params.ppl_stride > 0) { @@ -2015,9 +2021,6 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; - // ensure there's at least enough seq_ids for HellaSwag - params.n_parallel = std::max(4, params.n_parallel); - // load the model and apply lora adapter, if any std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == NULL) { From d23287f122c34ebef368742116d53a0ccb2041ee Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Wed, 3 Jul 2024 23:02:58 +0000 Subject: [PATCH 11/12] Define and optimize RDNA1 (#8085) --- ggml/src/ggml-cuda/common.cuh | 4 ++++ ggml/src/ggml-cuda/mmq.cuh | 10 +++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 472f4ace1..4ff06b871 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -227,6 +227,10 @@ typedef float2 dfloat2; #define RDNA2 #endif +#if defined(__gfx1010__) || defined(__gfx1012__) +#define RDNA1 +#endif + #ifndef __has_builtin #define __has_builtin(x) 0 #endif diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 1396e7a75..deaed066f 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -60,12 +60,16 @@ static constexpr __device__ int get_mmq_x_max_device() { } static constexpr int get_mmq_y_host(const int cc) { - return int8_mma_available(cc) || cc >= CC_VOLTA ? 128 : 64; + return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= CC_VOLTA ? 
128 : 64); } static constexpr __device__ int get_mmq_y_device() { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA1) + return 64; +#else return 128; +#endif // defined RDNA1 #else #if __CUDA_ARCH__ >= CC_VOLTA return 128; @@ -2259,9 +2263,9 @@ static __device__ void mul_mat_q_process_tile( template #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) +#if defined(RDNA3) || defined(RDNA2) || defined(RDNA1) __launch_bounds__(WARP_SIZE*nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(RDNA3) || defined(RDNA2) || defined(RDNA1) #else #if __CUDA_ARCH__ >= CC_VOLTA __launch_bounds__(WARP_SIZE*nwarps, 1) From f619024764e72261f14d7c31d892b8fb976603b4 Mon Sep 17 00:00:00 2001 From: AidanBeltonS Date: Thu, 4 Jul 2024 02:07:19 +0100 Subject: [PATCH 12/12] [SYCL] Remove unneeded semicolons (#8280) --- ggml/src/ggml-sycl/dpct/helper.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 1ff297218..5e98660dc 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -255,7 +255,7 @@ namespace dpct void set_pitch(size_t pitch) { _pitch = pitch; } size_t get_x() { return _x; } - void set_x(size_t x) { _x = x; }; + void set_x(size_t x) { _x = x; } size_t get_y() { return _y; } void set_y(size_t y) { _y = y; } @@ -1056,7 +1056,7 @@ namespace dpct #error "Only support Windows and Linux." #endif next_free = mapped_address_space; - }; + } public: using buffer_id_t = int; @@ -1077,7 +1077,7 @@ namespace dpct #else #error "Only support Windows and Linux." #endif - }; + } mem_mgr(const mem_mgr &) = delete; mem_mgr &operator=(const mem_mgr &) = delete;
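
A few reference sketches of techniques from this series follow (in Python; they are illustrative sketches, not code from the patches themselves). Patch 02's logging change reports a directory rather than a file name when the writer splits the model into shards, since no single output file exists in that case. A short sketch of the path handling it relies on — the gemma file name is taken from the commit message, the shard name below is hypothetical:

```python
import os
from pathlib import Path

def out_path_for_log(fname_out: Path, is_split: bool) -> str:
    # When the writer splits the model into shards, report the containing
    # directory (with a trailing separator); otherwise the single output file.
    return f"{fname_out.parent}{os.sep}" if is_split else str(fname_out)

print(out_path_for_log(Path("models/gemma-2-9b-it.gguf"), is_split=False))
# models/gemma-2-9b-it.gguf
print(out_path_for_log(Path("models/big-model-00001-of-00003.gguf"), is_split=True))
# models/   ("models\" on Windows, since os.sep is used)
```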
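
Patch 03 recovers `max_alibi_bias` from the first entry of JAIS's precomputed `relative_pe.slopes` tensor instead of storing the tensor itself. Here is a minimal sketch of that arithmetic, assuming the standard ALiBi slope layout in which the first attention head gets a slope of 2^(-max_bias/n), with n the head count rounded down to the nearest power of two; the 40-head configuration below is only an illustrative value.

```python
import math

def first_alibi_slope(n_head: int, max_bias: float = 8.0) -> float:
    # Standard ALiBi gives head i (1-based) the slope 2**(-max_bias * i / n),
    # where n is n_head rounded down to a power of two, so the first head's
    # slope is 2**(-max_bias / n).
    n = 2 ** math.floor(math.log2(n_head))
    return 2.0 ** (-max_bias / n)

def recover_max_alibi_bias(first_slope: float, n_head: int) -> float:
    # Inverse of the above, mirroring JaisModel.modify_tensors():
    #   max_bias = -log2(first_slope) * n_head_closest_log2
    n = 2 ** math.floor(math.log2(n_head))
    return float(-round(math.log2(first_slope) * n))

slope = first_alibi_slope(40)             # 2**(-8/32), roughly 0.8409
print(recover_max_alibi_bias(slope, 40))  # -> 8.0
```

Writing the recovered value into the GGUF header via `add_max_alibi_bias` lets the runtime rebuild the per-head slopes on the fly, so the slope tensor itself never needs to be exported.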
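
The same patch folds the muP-style factors (`embeddings_scale`, and `mup_output_alpha * mup_width_scale` for the output head) directly into the exported tensors rather than keeping them as runtime hyperparameters. A small sketch of why that is safe, assuming only that the factors are plain scalars applied to the token-embedding and output matrices; the value below is a stand-in, not a real JAIS config value:

```python
import torch

wte = torch.randn(8, 4)          # toy [n_vocab, n_embd] embedding table
scale = 9.25                     # stand-in; the real factor comes from config.json
tokens = torch.tensor([1, 3, 5])

at_runtime = wte[tokens] * scale   # factor applied on every forward pass
baked_in = (wte * scale)[tokens]   # factor folded into the weights at conversion
print(torch.allclose(at_runtime, baked_in))  # True
```

Because scalar multiplication commutes with the embedding lookup (and with the output matmul), baking the scale in at conversion time keeps the llama.cpp compute graph free of extra per-token multiplications.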