From 9146d36fe7e3e911a07438c07efc1bae082f6390 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Sun, 26 May 2024 15:09:42 +0300 Subject: [PATCH 01/13] Readme: add akx/ggify to tools (#1484) --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 2ee267fdf..15519c97f 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,10 @@ Unless otherwise noted these projects are open-source with permissive licensing: *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* +**Tools:** + +- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML + --- Here is a typical run using LLaMA v2 13B on M2 Ultra: From c429b33beb35f13934a4dfbe0c138d30b45e5d54 Mon Sep 17 00:00:00 2001 From: Bartowski Date: Sun, 26 May 2024 08:28:35 -0400 Subject: [PATCH 02/13] llama : add Smaug 70B support (#7402) --- convert-hf-to-gguf-update.py | 1 + convert-hf-to-gguf.py | 3 +++ llama.cpp | 4 ++++ llama.h | 1 + 4 files changed, 9 insertions(+) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 1923b88ba..84b72348d 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -81,6 +81,7 @@ models = [ {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, + {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, ] diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 51549ac72..bfccf8623 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -473,6 +473,9 @@ class Model: if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de res = "jina-v2-de" + if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": + # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct + res = "smaug-bpe" if res is None: logger.warning("\n") diff --git a/llama.cpp b/llama.cpp index 989d27b9d..f67cb7e23 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4593,6 +4593,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "dbrx") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX; + } else if ( + tokenizer_pre == "smaug-bpe") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -12512,6 +12515,7 @@ struct llm_tokenizer_bpe { }); break; case LLAMA_VOCAB_PRE_TYPE_DBRX: + case LLAMA_VOCAB_PRE_TYPE_SMAUG: word_collection = unicode_regex_split(text, { // same as llama3 "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", diff --git a/llama.h b/llama.h index 16676269d..7671b8a57 100644 --- a/llama.h +++ b/llama.h @@ -85,6 +85,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11, LLAMA_VOCAB_PRE_TYPE_OLMO = 12, LLAMA_VOCAB_PRE_TYPE_DBRX = 13, + LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, }; // note: these values should be synchronized with ggml_rope From 32a28217f475119926c603341e8273b26932b56a Mon Sep 17 00:00:00 2001 From: Galunid Date: Sun, 26 May 2024 16:02:34 +0200 Subject: [PATCH 03/13] Fix aya-23 conversion scripts (#7539) --- convert-hf-to-gguf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index bfccf8623..a342f6b1c 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2395,7 +2395,8 @@ class CommandR2Model(Model): # max_position_embeddings = 8192 in config.json but model was actually # trained on 128k context length - self.hparams["max_position_embeddings"] = self.hparams["model_max_length"] + # aya-23 models don't have model_max_length specified + self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"]) def set_gguf_parameters(self): super().set_gguf_parameters() From d298382ad977ec89c8de7b57459b9d7965d2c272 Mon Sep 17 00:00:00 2001 From: Brian Date: Mon, 27 May 2024 00:10:17 +1000 Subject: [PATCH 04/13] main: replace --no-special with --special (#7534) This also flips the default behavior of the output to not include control token by default. --- common/common.cpp | 6 +++--- common/common.h | 2 +- examples/main/main.cpp | 10 ++-------- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 781f2166b..65103c3c2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -904,8 +904,8 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.interactive_specials = true; return true; } - if (arg == "--no-special") { - params.no_special = true; + if (arg == "--special") { + params.special = true; return true; } if (arg == "--embedding") { @@ -1366,9 +1366,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param printf(" -h, --help show this help message and exit\n"); printf(" --version show version and build info\n"); printf(" -i, --interactive run in interactive mode\n"); + printf(" --special special tokens output enabled\n"); printf(" --interactive-specials allow special tokens in user text, in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); - printf(" --no-special control tokens output disabled\n"); printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); diff --git a/common/common.h b/common/common.h index 5388f6b68..264504830 100644 --- a/common/common.h +++ b/common/common.h @@ -146,7 +146,7 @@ struct gpt_params { bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode - bool no_special = false; // disable control token output + bool special = false; // enable special token output bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache diff --git a/examples/main/main.cpp b/examples/main/main.cpp index ac35772f1..44949ba86 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -740,16 +740,10 @@ int main(int argc, char ** argv) { // display text if (input_echo && display) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = llama_token_to_piece(ctx, id, params.special); // Console/Stream Output - if (!llama_token_is_control(llama_get_model(ctx), id)) { - // Stream Output Token To Standard Output - fprintf(stdout, "%s", token_str.c_str()); - } else if (!params.no_special && !params.conversation) { - // Stream Control Token To Standard Output Stream - fprintf(stdout, "%s", token_str.c_str()); - } + fprintf(stdout, "%s", token_str.c_str()); // Record Displayed Tokens To Log // Note: Generated tokens are created one by one hence this check From dff451cfa1f297348751ce6b538670e1ae9a7d5b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 26 May 2024 18:54:56 +0300 Subject: [PATCH 05/13] flake.lock: Update (#7540) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/4a6b83b05df1a8bd7d99095ec4b4d271f2956b64?narHash=sha256-%2BNpbZRCRisUHKQJZF3CT%2Bxn14ZZQO%2BKjxIIanH3Pvn4%3D' (2024-05-17) → 'github:NixOS/nixpkgs/bfb7a882678e518398ce9a31a881538679f6f092?narHash=sha256-4zSIhSRRIoEBwjbPm3YiGtbd8HDWzFxJjw5DYSDy1n8%3D' (2024-05-24) Co-authored-by: github-actions[bot] --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 451dfd32f..fd6e2a5f6 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1715961556, - "narHash": "sha256-+NpbZRCRisUHKQJZF3CT+xn14ZZQO+KjxIIanH3Pvn4=", + "lastModified": 1716509168, + "narHash": "sha256-4zSIhSRRIoEBwjbPm3YiGtbd8HDWzFxJjw5DYSDy1n8=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "4a6b83b05df1a8bd7d99095ec4b4d271f2956b64", + "rev": "bfb7a882678e518398ce9a31a881538679f6f092", "type": "github" }, "original": { From d6ef0e77dd25f54fb5856af47e3926cf6f36c281 Mon Sep 17 00:00:00 2001 From: Brian Date: Mon, 27 May 2024 10:54:30 +1000 Subject: [PATCH 06/13] github: add self sorted issue ticket forms (#7543) * github: add self sorted issue ticket forms [no ci] * github: consolidate BSD in bug issue ticket * github: remove contact from bug ticket template [no ci] * github: remove bios from os dropdown in bug report [no ci] --- .github/ISSUE_TEMPLATE/01-bug-low.yml | 50 +++++++++++++++++++++ .github/ISSUE_TEMPLATE/02-bug-medium.yml | 50 +++++++++++++++++++++ .github/ISSUE_TEMPLATE/03-bug-high.yml | 50 +++++++++++++++++++++ .github/ISSUE_TEMPLATE/04-bug-critical.yml | 50 +++++++++++++++++++++ .github/ISSUE_TEMPLATE/05-enhancement.yml | 51 ++++++++++++++++++++++ .github/ISSUE_TEMPLATE/06-question.yml | 38 ++++++++++++++++ .github/ISSUE_TEMPLATE/bug.md | 11 ----- .github/ISSUE_TEMPLATE/enhancement.md | 28 ------------ 8 files changed, 289 insertions(+), 39 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/01-bug-low.yml create mode 100644 .github/ISSUE_TEMPLATE/02-bug-medium.yml create mode 100644 .github/ISSUE_TEMPLATE/03-bug-high.yml create mode 100644 .github/ISSUE_TEMPLATE/04-bug-critical.yml create mode 100644 .github/ISSUE_TEMPLATE/05-enhancement.yml create mode 100644 .github/ISSUE_TEMPLATE/06-question.yml delete mode 100644 .github/ISSUE_TEMPLATE/bug.md delete mode 100644 .github/ISSUE_TEMPLATE/enhancement.md diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml new file mode 100644 index 000000000..bfb9d9a06 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml @@ -0,0 +1,50 @@ +name: Low Severity Bugs +description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches) +title: "Bug: " +labels: ["bug-unconfirmed", "low severity"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + Please include information about your system, the steps to reproduce the bug, + and the version of llama.cpp that you are using. + If possible, please provide a minimal code example that reproduces the bug. + - type: textarea + id: what-happened + attributes: + label: What happened? + description: Also tell us, what did you expect to happen? + placeholder: Tell us what you see! + validations: + required: true + - type: textarea + id: version + attributes: + label: Name and Version + description: Which executable and which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./main --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: What operating system are you seeing the problem on? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + render: shell diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml new file mode 100644 index 000000000..e8297eea0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml @@ -0,0 +1,50 @@ +name: Medium Severity Bug +description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable) +title: "Bug: " +labels: ["bug-unconfirmed", "medium severity"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + Please include information about your system, the steps to reproduce the bug, + and the version of llama.cpp that you are using. + If possible, please provide a minimal code example that reproduces the bug. + - type: textarea + id: what-happened + attributes: + label: What happened? + description: Also tell us, what did you expect to happen? + placeholder: Tell us what you see! + validations: + required: true + - type: textarea + id: version + attributes: + label: Name and Version + description: Which executable and which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./main --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: What operating system are you seeing the problem on? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + render: shell diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml new file mode 100644 index 000000000..3c9d50d16 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml @@ -0,0 +1,50 @@ +name: High Severity Bug +description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow) +title: "Bug: " +labels: ["bug-unconfirmed", "high severity"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + Please include information about your system, the steps to reproduce the bug, + and the version of llama.cpp that you are using. + If possible, please provide a minimal code example that reproduces the bug. + - type: textarea + id: what-happened + attributes: + label: What happened? + description: Also tell us, what did you expect to happen? + placeholder: Tell us what you see! + validations: + required: true + - type: textarea + id: version + attributes: + label: Name and Version + description: Which executable and which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./main --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: What operating system are you seeing the problem on? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + render: shell diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml new file mode 100644 index 000000000..d089d5fa1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml @@ -0,0 +1,50 @@ +name: Critical Severity Bug +description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss) +title: "Bug: " +labels: ["bug-unconfirmed", "critical severity"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + Please include information about your system, the steps to reproduce the bug, + and the version of llama.cpp that you are using. + If possible, please provide a minimal code example that reproduces the bug. + - type: textarea + id: what-happened + attributes: + label: What happened? + description: Also tell us, what did you expect to happen? + placeholder: Tell us what you see! + validations: + required: true + - type: textarea + id: version + attributes: + label: Name and Version + description: Which executable and which version of our software are you running? (use `--version` to get a version string) + placeholder: | + $./main --version + version: 2999 (42b4109e) + built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu + validations: + required: true + - type: dropdown + id: operating-system + attributes: + label: What operating system are you seeing the problem on? + multiple: true + options: + - Linux + - Mac + - Windows + - BSD + - Other? (Please let us know in description) + validations: + required: false + - type: textarea + id: logs + attributes: + label: Relevant log output + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + render: shell diff --git a/.github/ISSUE_TEMPLATE/05-enhancement.yml b/.github/ISSUE_TEMPLATE/05-enhancement.yml new file mode 100644 index 000000000..7f516abb0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/05-enhancement.yml @@ -0,0 +1,51 @@ +name: Enhancement template +description: Used to request enhancements for llama.cpp +title: "Feature Request: " +labels: ["enhancement"] +body: + - type: markdown + attributes: + value: | + [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas) + + - type: checkboxes + id: prerequisites + attributes: + label: Prerequisites + description: Please confirm the following before submitting your enhancement request. + options: + - label: I am running the latest code. Mention the version if possible as well. + required: true + - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md). + required: true + - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed). + required: true + - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share. + required: true + + - type: textarea + id: feature-description + attributes: + label: Feature Description + description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement. + placeholder: Detailed description of the enhancement + validations: + required: true + + - type: textarea + id: motivation + attributes: + label: Motivation + description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users. + placeholder: Explanation of why this feature is needed and its benefits + validations: + required: true + + - type: textarea + id: possible-implementation + attributes: + label: Possible Implementation + description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better. + placeholder: Detailed description of potential implementation + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/06-question.yml b/.github/ISSUE_TEMPLATE/06-question.yml new file mode 100644 index 000000000..23ad2f419 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/06-question.yml @@ -0,0 +1,38 @@ +name: Question template +description: Used to ask questions about llama.cpp +title: "Question: " +labels: ["question"] +body: + - type: markdown + attributes: + value: | + [Please search your question first in Discussion if you got a common general question.](https://github.com/ggerganov/llama.cpp/discussions/categories/q-a) + + - type: checkboxes + id: prerequisites + attributes: + label: Prerequisites + description: Please confirm the following before submitting your question. + options: + - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed). + required: true + - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new useful question to share that cannot be answered within Discussions. + required: true + + - type: textarea + id: background-description + attributes: + label: Background Description + description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an question. + placeholder: Detailed description of your question + validations: + required: true + + - type: textarea + id: possible-answer + attributes: + label: Possible Answer + description: If you have some idea of possible answers you want to confirm, that would also be appreciated. + placeholder: Your idea of possible answers + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md deleted file mode 100644 index 49812832c..000000000 --- a/.github/ISSUE_TEMPLATE/bug.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -name: Bug template -about: Used to report bugs in llama.cpp -labels: ["bug-unconfirmed"] -assignees: '' - ---- - -Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. - -If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests). diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md deleted file mode 100644 index dcffda750..000000000 --- a/.github/ISSUE_TEMPLATE/enhancement.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -name: Enhancement template -about: Used to request enhancements for llama.cpp -labels: ["enhancement"] -assignees: '' - ---- - -# Prerequisites - -Please answer the following questions for yourself before submitting an issue. - -- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. -- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md). -- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). -- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share. - -# Feature Description - -Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement. - -# Motivation - -Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users. - -# Possible Implementation - -If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better. From eaf6e031741ca2d3aafeff3e0f4dd7557a974d2b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 27 May 2024 09:24:13 +0300 Subject: [PATCH 07/13] llama : add comments about experimental flags (#7544) --- llama.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llama.h b/llama.h index 7671b8a57..3e4474bb9 100644 --- a/llama.h +++ b/llama.h @@ -265,6 +265,8 @@ extern "C" { bool check_tensors; // validate model tensor data }; + // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations + // https://github.com/ggerganov/llama.cpp/pull/7544 struct llama_context_params { uint32_t seed; // RNG seed, -1 for random uint32_t n_ctx; // text context, 0 = from model @@ -291,14 +293,14 @@ extern "C" { ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; - enum ggml_type type_k; // data type for K cache - enum ggml_type type_v; // data type for V cache + enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] + enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] // Keep the booleans together to avoid misalignment during copy-by-value. bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU - bool flash_attn; // whether to use flash attention + bool flash_attn; // whether to use flash attention [EXPERIMENTAL] // Abort callback // if it returns true, execution of llama_decode() will be aborted From 62bfef5194d5582486d62da3db59bf44981b7912 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 27 May 2024 10:38:39 +0300 Subject: [PATCH 08/13] metal : disable FA kernel for HS=256 (#7556) ggml-ci --- ggml-metal.m | 15 +++++++++------ ggml-metal.metal | 4 ++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index c9e570dbf..15fb68fc4 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -184,9 +184,9 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, - GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, + //GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261 GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, - GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, + //GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261 GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, @@ -634,9 +634,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); @@ -770,6 +770,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const case GGML_OP_LEAKY_RELU: return true; case GGML_OP_FLASH_ATTN_EXT: + if (op->src[0]->ne[0] == 256) { + return false; + } return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: @@ -2573,7 +2576,7 @@ static enum ggml_status ggml_metal_graph_compute( case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; - case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; + //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; default: { GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); @@ -2586,7 +2589,7 @@ static enum ggml_status ggml_metal_graph_compute( switch (ne00) { case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; - case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; + //case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; default: { GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); diff --git a/ggml-metal.metal b/ggml-metal.metal index 8ff70d7a7..ce51c74d5 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2418,7 +2418,7 @@ template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>; template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>; template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>; -template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>; +//template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>; template // head size, queries per threadgroup, cache items per threadgroup kernel void kernel_flash_attn_ext_vec_f16( @@ -2696,7 +2696,7 @@ kernel void kernel_flash_attn_ext_vec_f16( } template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>; -template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>; +//template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>; kernel void kernel_cpy_f16_f16( device const half * src0, From 1d8fca72ae9154eec0e1c0a75cfaac3c50f08e4a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 27 May 2024 12:10:19 +0300 Subject: [PATCH 09/13] metal : add GGML_OP_REPEAT kernels (#7557) ggml-ci --- ggml-metal.m | 53 ++++++++++++++++++++++++++++++++++++++++++++---- ggml-metal.metal | 47 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 4 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 15fb68fc4..ff9ae55aa 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -35,6 +35,10 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_ROW, GGML_METAL_KERNEL_TYPE_DIV, GGML_METAL_KERNEL_TYPE_DIV_ROW, + GGML_METAL_KERNEL_TYPE_REPEAT_F32, + GGML_METAL_KERNEL_TYPE_REPEAT_F16, + GGML_METAL_KERNEL_TYPE_REPEAT_I32, + GGML_METAL_KERNEL_TYPE_REPEAT_I16, GGML_METAL_KERNEL_TYPE_SCALE, GGML_METAL_KERNEL_TYPE_SCALE_4, GGML_METAL_KERNEL_TYPE_CLAMP, @@ -485,6 +489,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32, repeat_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16, repeat_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I32, repeat_i32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I16, repeat_i16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true); @@ -746,6 +754,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const case GGML_OP_ACC: case GGML_OP_MUL: case GGML_OP_DIV: + case GGML_OP_REPEAT: case GGML_OP_SCALE: case GGML_OP_CLAMP: case GGML_OP_SQR: @@ -979,8 +988,6 @@ static enum ggml_status ggml_metal_graph_compute( switch (dst->op) { case GGML_OP_CONCAT: { - const int64_t nb = ne00; - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline; [encoder setComputePipelineState:pipeline]; @@ -1011,7 +1018,6 @@ static enum ggml_status ggml_metal_graph_compute( [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24]; [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25]; [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26]; - [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; const int nth = MIN(1024, ne0); @@ -1021,11 +1027,14 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_OP_MUL: case GGML_OP_DIV: { + GGML_ASSERT(src0t == GGML_TYPE_F32); + GGML_ASSERT(src1t == GGML_TYPE_F32); + const size_t offs = 0; bool bcast_row = false; - int64_t nb = ne00; + int64_t nb = ne00; // used by the "row" kernels id pipeline = nil; @@ -1094,6 +1103,42 @@ static enum ggml_status ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } } break; + case GGML_OP_REPEAT: + { + id pipeline; + + switch (src0t) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break; + case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break; + case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break; + default: GGML_ASSERT(false); + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17]; + + const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); + + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case GGML_OP_ACC: { GGML_ASSERT(src0t == GGML_TYPE_F32); diff --git a/ggml-metal.metal b/ggml-metal.metal index ce51c74d5..174086b5b 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -168,6 +168,53 @@ kernel void kernel_div( } } +template +kernel void kernel_repeat( + device const char * src0, + device char * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i3 = tgpig.z; + const int64_t i2 = tgpig.y; + const int64_t i1 = tgpig.x; + + const int64_t i03 = i3 % ne03; + const int64_t i02 = i2 % ne02; + const int64_t i01 = i1 % ne01; + + device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01; + device char * dst_ptr = dst + i3*nb3 + i2*nb2 + i1*nb1 ; + + for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) { + const int i00 = i0 % ne00; + *((device T *)(dst_ptr + i0*nb0)) = *((device T *)(src0_ptr + i00*nb00)); + } +} + +typedef decltype(kernel_repeat) kernel_repeat_t; + +template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat; +template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat; +template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat; +template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat; + // assumption: src1 is a row // broadcast src1 into src0 kernel void kernel_add_row( From 5487593bc7ee0b65b9d2e2985b4b61dc77043101 Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Mon, 27 May 2024 13:34:09 +0100 Subject: [PATCH 10/13] Add freq factors (#7495) --- ggml-sycl.cpp | 94 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 37 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 496ec61c3..f329bc272 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -8830,12 +8830,11 @@ static void rope( dst[i + 1] = x0*sin_theta + x1*cos_theta; } -template +template static void rope_neox( const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims -, - const sycl::nd_item<3> &item_ct1) { + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, + const float * freq_factors, const sycl::nd_item<3> &item_ct1) { const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1)); @@ -8863,8 +8862,10 @@ static void rope_neox( float cur_rot = inv_ndims * ic - ib; const int p = has_pos ? pos[i2] : 0; + const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f; + const float theta_base = - p * freq_scale * dpct::pow(theta_scale, col / 2.0f); + p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor; float cos_theta, sin_theta; rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); @@ -12413,7 +12414,7 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows, const int32_t *pos, float freq_scale, int p_delta_rows, float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, - dpct::queue_ptr stream) { + const float * freq_factors, dpct::queue_ptr stream) { GGML_ASSERT(ncols % 2 == 0); const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); @@ -12423,38 +12424,48 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows, const float inv_ndims = -1.0f / n_dims; if (pos == nullptr) { - /* - DPCT1049:42: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ncols, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, inv_ndims, - item_ct1); - }); + if (freq_factors == nullptr) { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ncols, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, inv_ndims, freq_factors, + item_ct1); + }); + } else { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ncols, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, inv_ndims, freq_factors, + item_ct1); + }); + } } else { - /* - DPCT1049:43: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ncols, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, inv_ndims, item_ct1); - }); + if (freq_factors == nullptr) { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ncols, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1); + }); + } else { + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ncols, n_dims, pos, freq_scale, + p_delta_rows, ext_factor, attn_factor, + corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1); + }); + } } } @@ -13986,9 +13997,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const float *src0_dd, const float *src1_dd, float *dst_dd, const dpct::queue_ptr &main_stream) { -#pragma message("TODO: implement phi3 frequency factors support") -#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225") - GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet"); + const ggml_tensor * src2 = dst->src[2]; GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); @@ -14014,6 +14023,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1, memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + const float * freq_factors = nullptr; const int32_t * pos = nullptr; if ((mode & 1) == 0) { GGML_ASSERT(src1->type == GGML_TYPE_I32); @@ -14024,6 +14034,16 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1, const bool is_neox = mode & 2; const bool is_glm = mode & 4; + if (is_neox) { + pos = (const int32_t *) src1_dd; + + if (src2 != nullptr) { + freq_factors = (const float *) src2->data; + } + } else { + GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox"); + } + rope_corr_dims corr_dims; ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); @@ -14035,13 +14055,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1, if (src0->type == GGML_TYPE_F32) { rope_neox_sycl( (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, main_stream + attn_factor, corr_dims, freq_factors, main_stream ); } else if (src0->type == GGML_TYPE_F16) { rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, attn_factor, corr_dims, - main_stream); + freq_factors, main_stream); } else { GGML_ASSERT(false); } From 95f84d5ce8b449a9b16009434aca800df504a02e Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Mon, 27 May 2024 17:34:51 +0100 Subject: [PATCH 11/13] Fix q_xxs using mul_mat_q (#7459) --- ggml-sycl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index f329bc272..8839f775d 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -15263,6 +15263,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } else { bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type); + use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS); if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) { use_mul_mat_q = false; From 197c00681b80f9dea17d11a4436b6b8ef1be0ce8 Mon Sep 17 00:00:00 2001 From: agray3 Date: Mon, 27 May 2024 18:33:42 +0100 Subject: [PATCH 12/13] Allow multiple copy function pointers for CUDA graph kernel param updates (#7565) CUDA graphs require parameter updates to kernels associated with GGML_OP_CPY nodes. Previously the implementation only checked for a single CUDA kernel in such nodes, but this caused a bug in cases where 2 such kernels exist. This fixes the issue by using a vector to allow multiple function pointers to be stored and checked against. Fixes #7942 --- ggml-cuda.cu | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b82167cbf..2a90ee55c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2510,9 +2510,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t bool use_cuda_graph = true; bool cuda_graph_update_required = false; - // pointer to CUDA cpy kernel, which is required to identify + // vector of pointers to CUDA cpy kernels, which are required to identify // kernel parameters which need updated in the graph for each token - void * ggml_cuda_cpy_fn_ptr = nullptr; + std::vector ggml_cuda_cpy_fn_ptrs; if (cuda_ctx->cuda_graph->graph == nullptr) { if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { @@ -2588,9 +2588,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t if (node->op == GGML_OP_CPY) { // store the copy op parameter which changes with each token. cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data)); - if (ggml_cuda_cpy_fn_ptr == nullptr) { - // store a pointer to the copy op CUDA kernel to identify it later - ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]); + // store a pointer to each copy op CUDA kernel to identify it later + void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]); + if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) { + ggml_cuda_cpy_fn_ptrs.push_back(ptr); } } @@ -2720,7 +2721,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured int k = 0; for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) { - if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) { + if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) { char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++); cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr; CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i])); From 10b1e4587670feba2c7730a645accf8234873113 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 27 May 2024 19:34:40 +0200 Subject: [PATCH 13/13] make: add --device-debug to NVCC debug flags (#7542) --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index fe63cbd60..5caf31cdf 100644 --- a/Makefile +++ b/Makefile @@ -441,6 +441,9 @@ endif # JETSON_EOL_MODULE_DETECT ifdef LLAMA_DEBUG MK_NVCCFLAGS += -lineinfo endif # LLAMA_DEBUG +ifdef LLAMA_CUDA_DEBUG + MK_NVCCFLAGS += --device-debug +endif # LLAMA_CUDA_DEBUG ifdef LLAMA_CUDA_NVCC NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC) else