diff --git a/README.md b/README.md
index 34992b0af..1283f6805 100644
--- a/README.md
+++ b/README.md
@@ -153,6 +153,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
 - [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp
index a15bc8aa2..81783b7b1 100644
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -1670,10 +1670,6 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
                 // TODO: fix me
                 // Current groupsize should not be greater than k-1 in
                 // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
-                if (op->src[0]->ne[0]-1 > QK8_0) {
-                    return true;
-                }
-                return false;
             case GGML_TYPE_Q4_0:
                 return true;
             default:
diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
index f6deee3c5..9c8c86b66 100644
--- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
@@ -12,6 +12,9 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
     __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
                                 int64_t *input_ne_ub, size_t *input_nb_ub,
                                 int64_t *output_ne_ub) {
+        // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
+        //       permute=[0,0,0,0]):
+        //       [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
         int64_t op_block_num = GetBlockNum();
         int64_t op_block_idx = GetBlockIdx();

@@ -61,13 +64,13 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
         pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
         pipe.InitBuffer(output_queue, BUFFER_NUM,
                         Group_Size * sizeof(int8_t) / 2);
-        pipe.InitBuffer(cast_queue , BUFFER_NUM, Group_Size * sizeof(float));
-        pipe.InitBuffer(work_queue, BUFFER_NUM, Group_Size*sizeof(float));
-        pipe.InitBuffer(max_queue, BUFFER_NUM, Group_Size*sizeof(float));
-        pipe.InitBuffer(min_queue, BUFFER_NUM, Group_Size*sizeof(float));
-        pipe.InitBuffer(scale_queue, BUFFER_NUM, 16*sizeof(half));
-        pipe.InitBuffer(int8_queue, BUFFER_NUM, Group_Size * sizeof(int8_t));
-        pipe.InitBuffer(half_queue, BUFFER_NUM, Group_Size * sizeof(half));
+        pipe.InitBuffer(cast_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
+        pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
+        pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
+        pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
     }

     __aicore__ inline void copy_in(uint32_t offset) {
@@ -178,13 +181,15 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
         for (int64_t j = 0; j < group_size_in_row; j++) {
             half scale = calculate_group(i, j);
             scale_local.SetValue(scale_local_offset++, scale);
-            if (scale_local_offset == 16) {
+            // Copy Group_Size/2 length data each time.
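+            // Group_Size is the Q4_0 block size (32 elements per scale), so
+            // this batches 16 half-precision scales per DataCopy, matching
+            // the Group_Size / 2 * sizeof(half) scale_queue buffer allocated
+            // in init() above.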
+            if (scale_local_offset == Group_Size / 2) {
                 scale_local_offset = 0;
                 // TODO: OPTIMIZE ME
                 pipe_barrier(PIPE_ALL);
-                DataCopy(scale_gm[scale_global_offset], scale_local, 16);
+                DataCopy(scale_gm[scale_global_offset], scale_local,
+                         Group_Size / 2);
                 pipe_barrier(PIPE_ALL);
-                scale_global_offset += 16;
+                scale_global_offset += Group_Size / 2;
             }
         }
     }
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 42f4a34b8..910981e4a 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2312,7 +2312,7 @@ inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
-inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index e343c2ef1..59ffd92ea 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -161,6 +161,7 @@
         SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
         MIDDLE_ID = "tokenizer.ggml.middle_token_id"
         EOT_ID    = "tokenizer.ggml.eot_token_id"
+        EOM_ID    = "tokenizer.ggml.eom_token_id"

     class Adapter:
         TYPE = "adapter.type"
@@ -1327,3 +1328,4 @@ KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID
 KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
 KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
 KEY_TOKENIZER_EOT_ID    = Keys.Tokenizer.EOT_ID
+KEY_TOKENIZER_EOM_ID    = Keys.Tokenizer.EOM_ID
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 2e0b335ee..76385a828 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -828,6 +828,9 @@ class GGUFWriter:
     def add_eot_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOT_ID, id)

+    def add_eom_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.EOM_ID, id)
+
     def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
         pack_prefix = ''
         if not skip_pack_prefix:
diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py
index 15189f717..ea4d02705 100644
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -284,20 +284,67 @@
         ########################
         if model_card is not None:

-            if "model_name" in model_card and metadata.name is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.name = model_card.get("model_name")
+            def use_model_card_metadata(metadata_key: str, model_card_key: str):
+                if model_card_key in model_card and getattr(metadata, metadata_key, None) is None:
+                    setattr(metadata, metadata_key, model_card.get(model_card_key))

-            if "model_creator" in model_card and metadata.author is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.author = model_card.get("model_creator")
+            def use_array_model_card_metadata(metadata_key: str, model_card_key: str):
+                # Note: Will append rather than replace if the key already exists
+                tags_value = model_card.get(model_card_key, None)
+                if tags_value is None:
+                    return

-            if "model_type" in model_card and metadata.basename is None:
-                # Not part of huggingface model card standard but notice some model creator using it
-                # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
-                metadata.basename = model_card.get("model_type")
+                current_value = getattr(metadata, metadata_key, None)
+                if current_value is None:
+                    current_value = []
+
+                if isinstance(tags_value, str):
+                    current_value.append(tags_value)
+                elif isinstance(tags_value, list):
+                    current_value.extend(tags_value)
+
+                setattr(metadata, metadata_key, current_value)
+
+            # LLAMA.cpp's direct internal convention
+            # (Definitely not part of the hugging face formal/informal standard)
+            #########################################
+            use_model_card_metadata("name", "name")
+            use_model_card_metadata("author", "author")
+            use_model_card_metadata("version", "version")
+            use_model_card_metadata("organization", "organization")
+            use_model_card_metadata("description", "description")
+            use_model_card_metadata("finetune", "finetune")
+            use_model_card_metadata("basename", "basename")
+            use_model_card_metadata("size_label", "size_label")
+            use_model_card_metadata("source_url", "url")
+            use_model_card_metadata("source_doi", "doi")
+            use_model_card_metadata("source_uuid", "uuid")
+            use_model_card_metadata("source_repo_url", "repo_url")
+
+            # LLAMA.cpp's huggingface style convention
+            # (Definitely not part of the hugging face formal/informal standard... but with model_ appended to match their style)
+            ###########################################
+            use_model_card_metadata("name", "model_name")
+            use_model_card_metadata("author", "model_author")
+            use_model_card_metadata("version", "model_version")
+            use_model_card_metadata("organization", "model_organization")
+            use_model_card_metadata("description", "model_description")
+            use_model_card_metadata("finetune", "model_finetune")
+            use_model_card_metadata("basename", "model_basename")
+            use_model_card_metadata("size_label", "model_size_label")
+            use_model_card_metadata("source_url", "model_url")
+            use_model_card_metadata("source_doi", "model_doi")
+            use_model_card_metadata("source_uuid", "model_uuid")
+            use_model_card_metadata("source_repo_url", "model_repo_url")
+
+            # Hugging Face Direct Convention
+            #################################
+
+            # Not part of the huggingface model card standard, but some model creators use it,
+            # such as TheBloke in 'TheBloke/Mistral-7B-Instruct-v0.2-GGUF'
+            use_model_card_metadata("name", "model_name")
+            use_model_card_metadata("author", "model_creator")
+            use_model_card_metadata("basename", "model_type")

             if "base_model" in model_card:
                 # This represents the parent models that this is based on
@@ -329,58 +376,18 @@
                     base_model["repo_url"] = f"https://huggingface.co/{org_component}/{model_full_name_component}"
                 metadata.base_models.append(base_model)

-            if "license" in model_card and metadata.license is None:
-                metadata.license = model_card.get("license")
+            use_model_card_metadata("license", "license")
+            use_model_card_metadata("license_name", "license_name")
+            use_model_card_metadata("license_link", "license_link")

-            if "license_name" in model_card and metadata.license_name is None:
-                metadata.license_name = model_card.get("license_name")
+            use_array_model_card_metadata("tags", "tags")
+            use_array_model_card_metadata("tags", "pipeline_tag")

-            if "license_link" in model_card and metadata.license_link is None:
-                metadata.license_link = model_card.get("license_link")
+            use_array_model_card_metadata("languages", "languages")
+            use_array_model_card_metadata("languages", "language")

-            tags_value = model_card.get("tags", None)
-            if tags_value is not None:
-
-                if metadata.tags is None:
-                    metadata.tags = []
-
-                if isinstance(tags_value, str):
-                    metadata.tags.append(tags_value)
-                elif isinstance(tags_value, list):
-                    metadata.tags.extend(tags_value)
-
-            pipeline_tags_value = model_card.get("pipeline_tag", None)
-            if pipeline_tags_value is not None:
-
-                if metadata.tags is None:
-                    metadata.tags = []
-
-                if isinstance(pipeline_tags_value, str):
-                    metadata.tags.append(pipeline_tags_value)
-                elif isinstance(pipeline_tags_value, list):
-                    metadata.tags.extend(pipeline_tags_value)
-
-            language_value = model_card.get("languages", model_card.get("language", None))
-            if language_value is not None:
-
-                if metadata.languages is None:
-                    metadata.languages = []
-
-                if isinstance(language_value, str):
-                    metadata.languages.append(language_value)
-                elif isinstance(language_value, list):
-                    metadata.languages.extend(language_value)
-
-            dataset_value = model_card.get("datasets", model_card.get("dataset", None))
-            if dataset_value is not None:
-
-                if metadata.datasets is None:
-                    metadata.datasets = []
-
-                if isinstance(dataset_value, str):
-                    metadata.datasets.append(dataset_value)
-                elif isinstance(dataset_value, list):
-                    metadata.datasets.extend(dataset_value)
+            use_array_model_card_metadata("datasets", "datasets")
+            use_array_model_card_metadata("datasets", "dataset")

         # Hugging Face Parameter Heuristics
         ####################################
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 133094904..9be076f6d 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1444,7 +1444,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
     return token != -1 && (
         token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
+        token == llama_token_eom_impl(vocab)
     );
 }

@@ -1500,6 +1501,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
     return vocab.special_eot_id;
 }

+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 int32_t llama_tokenize_impl(
     const struct llama_vocab & vocab,
     const char * text,
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index 30b565d55..7adfc16da 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -45,6 +45,7 @@ struct llama_vocab {
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = -1;

     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
@@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);

 int32_t llama_tokenize_impl(
     const struct llama_vocab & vocab,
diff --git a/src/llama.cpp b/src/llama.cpp
index ff234565d..a7b1c9ebd 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -359,6 +359,7 @@
     LLM_KV_TOKENIZER_SUFFIX_ID,
     LLM_KV_TOKENIZER_MIDDLE_ID,
     LLM_KV_TOKENIZER_EOT_ID,
+    LLM_KV_TOKENIZER_EOM_ID,

     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
@@ -456,6 +457,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
     { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
     { LLM_KV_TOKENIZER_EOT_ID,    "tokenizer.ggml.eot_token_id" },
+    { LLM_KV_TOKENIZER_EOM_ID,    "tokenizer.ggml.eom_token_id" },

     { LLM_KV_ADAPTER_TYPE,       "adapter.type" },
     { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
@@ -5583,6 +5585,7 @@
         { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id },
         { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
         { LLM_KV_TOKENIZER_EOT_ID,    vocab.special_eot_id },
+        { LLM_KV_TOKENIZER_EOM_ID,    vocab.special_eom_id },
     };

     for (const auto & it : special_token_types) {
@@ -5635,6 +5638,17 @@
                 }
             }
         }
+
+        // find EOM token: "<|eom_id|>"
+        //
+        // TODO: convert scripts should provide this token through the KV metadata LLM_KV_TOKENIZER_EOM_ID
+        //       for now, we apply this workaround to find the EOM token based on its text
+        if (vocab.special_eom_id == -1) {
+            const auto & t = vocab.token_to_id.find("<|eom_id|>");
+            if (t != vocab.token_to_id.end()) {
+                vocab.special_eom_id = t->second;
+            }
+        }
     }

     // build special tokens cache
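
Note on the `ggml_vec_elu_f32` change: for x <= 0, ELU(x) = e^x - 1. The old `expf(x[i])-1` form suffers catastrophic cancellation for small |x|, because `expf(x)` lands within a couple of ulps of 1.0f and the subtraction wipes out almost all significant bits; `expm1f` computes e^x - 1 directly and stays accurate near zero. A minimal standalone sketch of the difference (illustrative only, not part of the patch):

```c
#include <math.h>
#include <stdio.h>

int main(void) {
    const float x = -1e-7f;
    // expf(x) rounds to within 2 ulps of 1.0f, so subtracting 1.0f leaves
    // almost no correct bits: about -1.19e-07 on a typical IEEE libm,
    // versus a true value of -9.9999999e-08 (~19% relative error).
    const float naive = expf(x) - 1.f;
    // expm1f evaluates e^x - 1 in one step and is accurate near zero.
    const float precise = expm1f(x);
    printf("expf(x)-1 = %.9g\nexpm1f(x) = %.9g\n", naive, precise);
    return 0;
}
```

The same cancellation argument applies wherever `exp(x) - 1` or `log(1 + x)` is evaluated for small x, which is why the C math library pairs `expm1f` with `log1pf`.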