Merge remote-tracking branch 'upstream/master' into deepseek-v2

commit 98ff6e1b45
61 changed files with 1757 additions and 313 deletions
.github/ISSUE_TEMPLATE/01-bug-low.yml (vendored, new file, 50 lines)
@@ -0,0 +1,50 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./main --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/02-bug-medium.yml (vendored, new file, 50 lines)
@@ -0,0 +1,50 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./main --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/03-bug-high.yml (vendored, new file, 50 lines)
@@ -0,0 +1,50 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./main --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/04-bug-critical.yml (vendored, new file, 50 lines)
@@ -0,0 +1,50 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./main --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/05-enhancement.yml (vendored, new file, 51 lines)
@@ -0,0 +1,51 @@
name: Enhancement template
description: Used to request enhancements for llama.cpp
title: "Feature Request: "
labels: ["enhancement"]
body:
  - type: markdown
    attributes:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)

  - type: checkboxes
    id: prerequisites
    attributes:
      label: Prerequisites
      description: Please confirm the following before submitting your enhancement request.
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true

  - type: textarea
    id: feature-description
    attributes:
      label: Feature Description
      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
      placeholder: Detailed description of the enhancement
    validations:
      required: true

  - type: textarea
    id: motivation
    attributes:
      label: Motivation
      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
      placeholder: Explanation of why this feature is needed and its benefits
    validations:
      required: true

  - type: textarea
    id: possible-implementation
    attributes:
      label: Possible Implementation
      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
      placeholder: Detailed description of potential implementation
    validations:
      required: false
.github/ISSUE_TEMPLATE/06-question.yml (vendored, new file, 38 lines)
@@ -0,0 +1,38 @@
name: Question template
description: Used to ask questions about llama.cpp
title: "Question: "
labels: ["question"]
body:
  - type: markdown
    attributes:
      value: |
        [Please search your question first in Discussion if you got a common general question.](https://github.com/ggerganov/llama.cpp/discussions/categories/q-a)

  - type: checkboxes
    id: prerequisites
    attributes:
      label: Prerequisites
      description: Please confirm the following before submitting your question.
      options:
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new useful question to share that cannot be answered within Discussions.
          required: true

  - type: textarea
    id: background-description
    attributes:
      label: Background Description
      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an question.
      placeholder: Detailed description of your question
    validations:
      required: true

  - type: textarea
    id: possible-answer
    attributes:
      label: Possible Answer
      description: If you have some idea of possible answers you want to confirm, that would also be appreciated.
      placeholder: Your idea of possible answers
    validations:
      required: false
.github/ISSUE_TEMPLATE/bug.md (vendored, deleted, 11 lines)
@@ -1,11 +0,0 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug-unconfirmed"]
assignees: ''

---

Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.

If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
.github/ISSUE_TEMPLATE/enhancement.md (vendored, deleted, 28 lines)
@@ -1,28 +0,0 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''

---

# Prerequisites

Please answer the following questions for yourself before submitting an issue.

- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.

# Feature Description

Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.

# Motivation

Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.

# Possible Implementation

If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
.github/labeler.yml (vendored, 14 lines changed)
@@ -1,5 +1,16 @@
# https://github.com/actions/labeler

Kompute:
  - changed-files:
      - any-glob-to-any-file:
          - ggml-kompute.h
          - ggml-kompute.cpp
          - README-kompute.md
Apple Metal:
  - changed-files:
      - any-glob-to-any-file:
          - ggml-metal.h
          - ggml-metal.cpp
          - README-metal.md
SYCL:
  - changed-files:
      - any-glob-to-any-file:

@@ -9,6 +20,7 @@ SYCL:
Nvidia GPU:
  - changed-files:
      - any-glob-to-any-file:
          - ggml-cuda.h
          - ggml-cuda/**
Vulkan:
  - changed-files:
.github/workflows/docker.yml (vendored, 5 lines changed)
@@ -42,8 +42,9 @@ jobs:
          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
          # TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
          #- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
          #- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
@@ -72,6 +72,7 @@ else()
    set(INS_ENB ON)
endif()

option(LLAMA_SVE "llama: enable SVE" OFF)
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
option(LLAMA_AVX512 "llama: enable AVX512" OFF)

@@ -1040,6 +1041,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
            # Raspberry Pi 3, 4, Zero 2 (32-bit)
            list(APPEND ARCH_FLAGS -mno-unaligned-access)
        endif()
        if (LLAMA_SVE)
            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
        endif()
    endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
@@ -1,4 +1,4 @@
{
{
    "version": 4,
    "configurePresets": [
        {

@@ -40,6 +40,10 @@
        { "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
        { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
        { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
        { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },

        { "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
        { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
        { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
    ]
}
Makefile (3 lines changed)
@@ -441,6 +441,9 @@ endif # JETSON_EOL_MODULE_DETECT
ifdef LLAMA_DEBUG
	MK_NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG
ifdef LLAMA_CUDA_DEBUG
	MK_NVCCFLAGS += --device-debug
endif # LLAMA_CUDA_DEBUG
ifdef LLAMA_CUDA_NVCC
	NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
else
@@ -203,6 +203,10 @@ Unless otherwise noted these projects are open-source with permissive licensing:

*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

**Tools:**

- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML

---

Here is a typical run using LLaMA v2 13B on M2 Ultra:
@@ -904,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.interactive_specials = true;
        return true;
    }
    if (arg == "--special") {
        params.special = true;
        return true;
    }
    if (arg == "--embedding") {
        params.embedding = true;
        return true;

@@ -1362,6 +1366,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    printf(" -h, --help show this help message and exit\n");
    printf(" --version show version and build info\n");
    printf(" -i, --interactive run in interactive mode\n");
    printf(" --special special tokens output enabled\n");
    printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
    printf(" --interactive-first run in interactive mode and wait for input right away\n");
    printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");

@@ -1855,11 +1860,15 @@ bool fs_create_directory_with_parents(const std::string & path) {

std::string fs_get_cache_directory() {
    std::string cache_directory = "";
    auto ensure_trailing_slash = [](std::string p) {
        // Make sure to add trailing slash
        if (p.back() != DIRECTORY_SEPARATOR) {
            p += DIRECTORY_SEPARATOR;
        }
        return p;
    };
    if (getenv("LLAMA_CACHE")) {
        cache_directory = std::getenv("LLAMA_CACHE");
        if (cache_directory.back() != DIRECTORY_SEPARATOR) {
            cache_directory += DIRECTORY_SEPARATOR;
        }
    } else {
#ifdef __linux__
        if (std::getenv("XDG_CACHE_HOME")) {

@@ -1870,12 +1879,12 @@ std::string fs_get_cache_directory() {
#elif defined(__APPLE__)
        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
        cache_directory = std::getenv("APPDATA");
        cache_directory = std::getenv("LOCALAPPDATA");
#endif // __linux__
        cache_directory = ensure_trailing_slash(cache_directory);
        cache_directory += "llama.cpp";
        cache_directory += DIRECTORY_SEPARATOR;
    }
    return cache_directory;
    return ensure_trailing_slash(cache_directory);
}


@@ -2840,6 +2849,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
    fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
@@ -146,6 +146,7 @@ struct gpt_params {
    bool use_color = false; // use color to distinguish generations and inputs
    bool interactive = false; // interactive mode
    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
    bool special = false; // enable special token output
    bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
    bool chatml = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all = false; // save user input and generations to prompt cache
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {

    params.custom_n_ctx = false;

    params.use_flash = true;
    params.use_flash = false;
    params.use_checkpointing = true;

    params.sample_start = "";
@@ -81,6 +81,7 @@ models = [
    {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
    {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
    {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
]
@@ -313,11 +313,10 @@ class Model:
                data = data.astype(np.float32)
                data_qtype = gguf.GGMLQuantizationType.F32

            block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
            shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape

            # reverse shape to make it similar to the internal ggml dimension order
            shape_str = f"""{{{', '.join(str(n) for n in reversed(
                (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
            )}}}"""
            shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"

            # n_dims is implicit in the shape
            logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

@@ -474,6 +473,9 @@ class Model:
        if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
            res = "jina-v2-de"
        if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
            # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
            res = "smaug-bpe"

        if res is None:
            logger.warning("\n")

@@ -2393,7 +2395,8 @@ class CommandR2Model(Model):

        # max_position_embeddings = 8192 in config.json but model was actually
        # trained on 128k context length
        self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
        # aya-23 models don't have model_max_length specified
        self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {

    params.samples_start_after_nl = false;
    params.use_adam = true;
    params.use_flash = true;
    params.use_flash = false;
    params.use_scratch = true;

    // only adam
@@ -7,8 +7,6 @@ android {
    namespace = "com.example.llama"
    compileSdk = 34

    ndkVersion = "26.1.10909125"

    defaultConfig {
        applicationId = "com.example.llama"
        minSdk = 33

@@ -20,17 +18,6 @@ android {
        vectorDrawables {
            useSupportLibrary = true
        }
        ndk {
            // Add NDK properties if wanted, e.g.
            // abiFilters += listOf("arm64-v8a")
        }
        externalNativeBuild {
            cmake {
                arguments += "-DCMAKE_BUILD_TYPE=Release"
                cppFlags += listOf()
                arguments += listOf()
            }
        }
    }

    buildTypes {

@@ -55,17 +42,6 @@ android {
    composeOptions {
        kotlinCompilerExtensionVersion = "1.5.1"
    }
    packaging {
        resources {
            excludes += "/META-INF/{AL2.0,LGPL2.1}"
        }
    }
    externalNativeBuild {
        cmake {
            path = file("src/main/cpp/CMakeLists.txt")
            version = "3.22.1"
        }
    }
}

dependencies {

@@ -78,6 +54,7 @@ dependencies {
    implementation("androidx.compose.ui:ui-graphics")
    implementation("androidx.compose.ui:ui-tooling-preview")
    implementation("androidx.compose.material3:material3")
    implementation(project(":llama"))
    testImplementation("junit:junit:4.13.2")
    androidTestImplementation("androidx.test.ext:junit:1.1.5")
    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
@@ -1,5 +1,6 @@
package com.example.llama

import android.llama.cpp.LLamaAndroid
import android.util.Log
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf

@@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch

class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
    companion object {
        @JvmStatic
        private val NanosPerSecond = 1_000_000_000.0

@@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {

        viewModelScope.launch {
            try {
                llm.unload()
                llamaAndroid.unload()
            } catch (exc: IllegalStateException) {
                messages += exc.message!!
            }

@@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
        messages += ""

        viewModelScope.launch {
            llm.send(text)
            llamaAndroid.send(text)
                .catch {
                    Log.e(tag, "send() failed", it)
                    messages += it.message!!

@@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
        viewModelScope.launch {
            try {
                val start = System.nanoTime()
                val warmupResult = llm.bench(pp, tg, pl, nr)
                val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
                val end = System.nanoTime()

                messages += warmupResult

@@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
                    return@launch
                }

                messages += llm.bench(512, 128, 1, 3)
                messages += llamaAndroid.bench(512, 128, 1, 3)
            } catch (exc: IllegalStateException) {
                Log.e(tag, "bench() failed", exc)
                messages += exc.message!!

@@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
    fun load(pathToModel: String) {
        viewModelScope.launch {
            try {
                llm.load(pathToModel)
                llamaAndroid.load(pathToModel)
                messages += "Loaded $pathToModel"
            } catch (exc: IllegalStateException) {
                Log.e(tag, "load() failed", exc)
@@ -2,4 +2,5 @@
plugins {
    id("com.android.application") version "8.2.0" apply false
    id("org.jetbrains.kotlin.android") version "1.9.0" apply false
    id("com.android.library") version "8.2.0" apply false
}
examples/llama.android/llama/.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
/build
@@ -42,7 +42,7 @@ add_subdirectory(../../../../../../ build-llama)
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
    # List C/C++ source files with relative paths to this CMakeLists.txt.
    llama-android.cpp)
    llama-android.cpp)

# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
examples/llama.android/llama/build.gradle.kts (new file, 68 lines)
@@ -0,0 +1,68 @@
plugins {
    id("com.android.library")
    id("org.jetbrains.kotlin.android")
}

android {
    namespace = "android.llama.cpp"
    compileSdk = 34

    defaultConfig {
        minSdk = 33

        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
        consumerProguardFiles("consumer-rules.pro")
        ndk {
            // Add NDK properties if wanted, e.g.
            // abiFilters += listOf("arm64-v8a")
        }
        externalNativeBuild {
            cmake {
                arguments += "-DCMAKE_BUILD_TYPE=Release"
                cppFlags += listOf()
                arguments += listOf()

                cppFlags("")
            }
        }
    }

    buildTypes {
        release {
            isMinifyEnabled = false
            proguardFiles(
                getDefaultProguardFile("proguard-android-optimize.txt"),
                "proguard-rules.pro"
            )
        }
    }
    externalNativeBuild {
        cmake {
            path("src/main/cpp/CMakeLists.txt")
            version = "3.22.1"
        }
    }
    compileOptions {
        sourceCompatibility = JavaVersion.VERSION_1_8
        targetCompatibility = JavaVersion.VERSION_1_8
    }
    kotlinOptions {
        jvmTarget = "1.8"
    }

    packaging {
        resources {
            excludes += "/META-INF/{AL2.0,LGPL2.1}"
        }
    }
}

dependencies {

    implementation("androidx.core:core-ktx:1.12.0")
    implementation("androidx.appcompat:appcompat:1.6.1")
    implementation("com.google.android.material:material:1.11.0")
    testImplementation("junit:junit:4.13.2")
    androidTestImplementation("androidx.test.ext:junit:1.1.5")
    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
}
examples/llama.android/llama/consumer-rules.pro (new file, empty)
examples/llama.android/llama/proguard-rules.pro (vendored, new file, 21 lines)
@@ -0,0 +1,21 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
#   http://developer.android.com/guide/developing/tools/proguard.html

# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
#   public *;
#}

# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable

# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile
@@ -0,0 +1,24 @@
package android.llama.cpp

import androidx.test.platform.app.InstrumentationRegistry
import androidx.test.ext.junit.runners.AndroidJUnit4

import org.junit.Test
import org.junit.runner.RunWith

import org.junit.Assert.*

/**
 * Instrumented test, which will execute on an Android device.
 *
 * See [testing documentation](http://d.android.com/tools/testing).
 */
@RunWith(AndroidJUnit4::class)
class ExampleInstrumentedTest {
    @Test
    fun useAppContext() {
        // Context of the app under test.
        val appContext = InstrumentationRegistry.getInstrumentation().targetContext
        assertEquals("android.llama.cpp.test", appContext.packageName)
    }
}
@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android">

</manifest>
examples/llama.android/llama/src/main/cpp/CMakeLists.txt (new file, 49 lines)
@@ -0,0 +1,49 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.

# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)

# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")

include(FetchContent)
FetchContent_Declare(
        llama
        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
        GIT_TAG master
)

# Also provides "common"
FetchContent_MakeAvailable(llama)

# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
    # List C/C++ source files with relative paths to this CMakeLists.txt.
    llama-android.cpp)

# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
    # List libraries link to the target library
    llama
    common
    android
    log)
@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {

extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
    llama_model_params model_params = llama_model_default_params();

    auto path_to_model = env->GetStringUTFChars(filename, 0);

@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {

extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
    llama_free_model(reinterpret_cast<llama_model *>(model));
}

extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
    auto model = reinterpret_cast<llama_model *>(jmodel);

    if (!model) {

@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {

extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
    llama_free(reinterpret_cast<llama_context *>(context));
}

extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
    llama_backend_free();
}

extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
    llama_log_set(log_callback, NULL);
}

extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_bench_1model(
Java_android_llama_cpp_LLamaAndroid_bench_1model(
        JNIEnv *env,
        jobject,
        jlong context_pointer,

@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(

extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}

extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {

    // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.

@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb

extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
    llama_backend_init();
}

extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
    return env->NewStringUTF(llama_print_system_info());
}

extern "C"
JNIEXPORT jint JNICALL
Java_com_example_llama_Llm_completion_1init(
Java_android_llama_cpp_LLamaAndroid_completion_1init(
        JNIEnv *env,
        jobject,
        jlong context_pointer,

@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(

extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_completion_1loop(
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
        JNIEnv * env,
        jobject,
        jlong context_pointer,

@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(

extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
    llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
}
@@ -1,4 +1,4 @@
package com.example.llama
package android.llama.cpp

import android.util.Log
import kotlinx.coroutines.CoroutineDispatcher

@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import kotlin.concurrent.thread

class Llm {
class LLamaAndroid {
    private val tag: String? = this::class.simpleName

    private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }

@@ -165,8 +165,8 @@ class Llm {
        }

        // Enforce only one instance of Llm.
        private val _instance: Llm = Llm()
        private val _instance: LLamaAndroid = LLamaAndroid()

        fun instance(): Llm = _instance
        fun instance(): LLamaAndroid = _instance
    }
}
@@ -0,0 +1,17 @@
package android.llama.cpp

import org.junit.Test

import org.junit.Assert.*

/**
 * Example local unit test, which will execute on the development machine (host).
 *
 * See [testing documentation](http://d.android.com/tools/testing).
 */
class ExampleUnitTest {
    @Test
    fun addition_isCorrect() {
        assertEquals(4, 2 + 2)
    }
}
@@ -15,3 +15,4 @@ dependencyResolutionManagement {

rootProject.name = "LlamaAndroid"
include(":app")
include(":llama")
@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
@@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo && display) {
            for (auto id : embd) {
                const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
                printf("%s", token_str.c_str());
                const std::string token_str = llama_token_to_piece(ctx, id, params.special);

                // Console/Stream Output
                fprintf(stdout, "%s", token_str.c_str());

                // Record Displayed Tokens To Log
                // Note: Generated tokens are created one by one hence this check
                if (embd.size() > 1) {
                    // Incoming Requested Tokens
                    input_tokens.push_back(id);
                } else {
                    // Outgoing Generated Tokens
                    output_tokens.push_back(id);
                    output_ss << token_str;
                }

                fflush(stdout);
            }
            fflush(stdout);
        }

        // reset color to default if there is no pending user input
        if (input_echo && (int) embd_inp.size() == n_consumed) {
            console::set_display(console::reset);
@@ -594,7 +594,7 @@
                message = html`<${Probabilities} data=${data} />`
            } else {
                const text = isArrayMessage ?
                    data.map(msg => msg.content).join('').replace(/^\s+/, '') :
                    data.map(msg => msg.content).join('') :
                    data;
                message = isCompletionMode ?
                    text :

@@ -877,19 +877,30 @@

    // poor mans markdown replacement
    const Markdownish = (params) => {
        const md = params.text
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
            .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
            .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
            .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
            .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
            .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
            .replace(/`(.*?)`/g, '<code>$1</code>')
            .replace(/\n/gim, '<br />');
        return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
        const chunks = params.text.split('```');

        for (let i = 0; i < chunks.length; i++) {
            if (i % 2 === 0) { // outside code block
                chunks[i] = chunks[i]
                    .replace(/&/g, '&amp;')
                    .replace(/</g, '&lt;')
                    .replace(/>/g, '&gt;')
                    .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
                    .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
                    .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
                    .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
                    .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
                    .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
                    .replace(/`(.*?)`/g, '<code>$1</code>')
                    .replace(/\n/gim, '<br />');
            } else { // inside code block
                chunks[i] = `<pre><code>${chunks[i]}</code></pre>`;
            }
        }

        const restoredText = chunks.join('');

        return html`<span dangerouslySetInnerHTML=${{ __html: restoredText }} />`;
    };

    const ModelGenerationInfo = (params) => {

@@ -903,6 +914,7 @@
        `
    }

    // simple popover impl
    const Popover = (props) => {
        const isOpen = useSignal(false);

@@ -1054,4 +1066,3 @@
</body>

</html>
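The reworked Markdownish helper in the hunk above splits the message text on ``` so that HTML escaping and the markdown regexes only run outside code blocks. The following is an illustrative sketch of that behaviour only, not the actual helper from index.html; the variable names here are made up for the example.

```javascript
// Illustrative only: even-indexed chunks sit outside ``` fences and get escaped/markdown-ified,
// odd-indexed chunks are treated as code and wrapped verbatim in <pre><code>.
const text = "inline `ls -l`\n```\nif (a < b && c > d) { }\n```";
const chunks = text.split('```');
const rendered = chunks
    .map((chunk, i) => i % 2 === 0
        ? chunk.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;')
               .replace(/`(.*?)`/g, '<code>$1</code>')
        : `<pre><code>${chunk}</code></pre>`)
    .join('');
```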
@@ -1,7 +1,7 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <title>SimpleChat (LlamaCPP, ...) </title>
    <title>SimpleChat LlamaCppEtal </title>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <meta name="message" content="Save Nature Save Earth" />

@@ -30,20 +30,17 @@
        <hr>
        <div class="sameline">
            <label for="system-in">System</label>
            <input type="text" name="system" id="system-in" class="flex-grow"/>
            <input type="text" name="system" id="system-in" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"/>
        </div>

        <hr>
        <div id="chat-div">
            <p> Enter the system prompt above, before entering/submitting any user query.</p>
            <p> Enter your text to the ai assistant below.</p>
            <p> Use shift+enter for inserting enter.</p>
            <p> Refresh the page to start over fresh.</p>
            <p> You need to have javascript enabled.</p>
        </div>

        <hr>
        <div class="sameline">
            <textarea id="user-in" class="flex-grow" rows="3"></textarea>
            <textarea id="user-in" class="flex-grow" rows="3" placeholder="enter your query to the ai model here" ></textarea>
            <button id="user-btn">submit</button>
        </div>
@@ -14,11 +14,15 @@ own system prompts.
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
enough manner, in general.

NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
culling of old messages from the chat.
Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool
console.

NOTE: It doesnt set any parameters other than temperature for now. However if someone wants they can update
the js file as needed.
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
culling of old messages from the chat by default. However by enabling the sliding window chat logic, a crude
form of old messages culling can be achieved.

NOTE: It doesnt set any parameters other than temperature and max_tokens for now. However if someone wants
they can update the js file or equivalent member in gMe as needed.


## usage

@@ -43,11 +47,33 @@ next run this web front end in examples/server/public_simplechat

### using the front end

Open this simple web front end from your local browser

* http://127.0.0.1:PORT/index.html

Once inside

* Select between chat and completion mode. By default it is set to chat mode.

* In completion mode
  * logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message.
    If the model requires any prefix wrt user role messages, then the end user has to
    explicitly add the needed prefix, when they enter their chat message.
    Similarly if the model requires any prefix to trigger assistant/ai-model response,
    then the end user needs to enter the same.
    This keeps the logic simple, while still giving flexibility to the end user to
    manage any templating/tagging requirement wrt their messages to the model.
  * the logic doesnt insert newline at the begining and end wrt the prompt message generated.
    However if the chat being sent to /completions end point has more than one role's message,
    then insert newline when moving from one role's message to the next role's message, so
    that it can be clearly identified/distinguished.
  * given that /completions endpoint normally doesnt add additional chat-templating of its
    own, the above ensures that end user can create a custom single/multi message combo with
    any tags/special-tokens related chat templating to test out model handshake. Or enduser
    can use it just for normal completion related/based query.

* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
  Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting
  responses with a suitable system prompt.
  * if chat.add_system_begin is used
    * you cant change the system prompt, after it is has been submitted once along with user query.
    * you cant set a system prompt, after you have submitted any user query

@@ -55,27 +81,121 @@ Once inside
    * one can change the system prompt any time during chat, by changing the contents of system prompt.
    * inturn the updated/changed system prompt will be inserted into the chat session.
    * this allows for the subsequent user chatting to be driven by the new system prompt set above.

* Enter your query and either press enter or click on the submit button.
  If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.

* Wait for the logic to communicate with the server and get the response.
  * the user is not allowed to enter any fresh query during this time.
  * the user input box will be disabled and a working message will be shown in it.

* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.

* Using NewChat one can start independent chat sessions.
  * two independent chat sessions are setup by default.
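To make the completion-mode behaviour described in the list above concrete, here is a minimal sketch of how a /completions prompt could be assembled from the chat messages. It is illustrative only; the function and field names are assumptions for this example, not the actual simplechat source.

```javascript
// Sketch only: join each role's message with a newline between roles, optionally
// adding a "ROLE: " prefix, as described in the completion-mode notes above.
function buildCompletionPrompt(messages, insertRolePrefix = false) {
    return messages
        .map(m => insertRolePrefix ? `${m.role}: ${m.content}` : m.content)
        .join("\n");
}

// Example: a custom system + user combo sent as a single prompt string.
const prompt = buildCompletionPrompt([
    { role: "system", content: "you are a helpful ai assistant" },
    { role: "user",   content: "hello" },
]);
```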
## Devel note

### Reason behind this

The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable
by developers who may not be from web frontend background (so inturn may not be familiar with template /
end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.

And given that the idea is also to help explore/experiment for developers, some flexibility is provided
to change behaviour easily using the devel-tools/console, for now. And skeletal logic has been implemented
to explore some of the end points and ideas/implications around them.


### General

Me/gMe consolidates the settings which control the behaviour into one object.
One can see the current settings, as well as change/update them using browsers devel-tool/console.

  bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when
  communicating with the server or only sends the latest user query/message.

  bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the
  messages that get inserted into prompt field wrt /Completion endpoint.

  chatRequestOptions - maintains the list of options/fields to send along with chat request,
  irrespective of whether /chat/completions or /completions endpoint.

    If you want to add additional options/fields to send to the server/ai-model, and or
    modify the existing options value or remove them, for now you can update this global var
    using browser's development-tools/console.

  iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end.
  This is disabled by default. However if enabled, then in addition to latest system message, only
  the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses
  from the ai model will be sent to the ai-model, when querying for a new response. IE if enabled,
  only user messages after the latest system message/prompt will be considered.

    This specified sliding window user message count also includes the latest user query.
    <0 : Send entire chat history to server
     0 : Send only the system message if any to the server
    >0 : Send the latest chat history from the latest system prompt, limited to specified cnt.


By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
implications of loading of the ai-model's context window by chat history, wrt chat response to
some extent in a simple crude way.
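The iRecentUserMsgCnt behaviour described above can be pictured with a small sketch. This is an assumed reconstruction of the selection logic for illustration, not the actual simplechat code; the function and message-field names are hypothetical.

```javascript
// Assumed illustration of the sliding-window selection described above:
// keep the latest system message, plus everything from the Nth-last user message onward.
function recentChat(messages, iRecentUserMsgCnt) {
    if (iRecentUserMsgCnt < 0) return messages;               // <0 : send the entire chat history
    let sysIdx = -1;
    messages.forEach((m, i) => { if (m.role === "system") sysIdx = i; });
    const latestSys = sysIdx >= 0 ? [messages[sysIdx]] : [];
    if (iRecentUserMsgCnt === 0) return latestSys;            //  0 : send only the system message, if any
    const after = messages.slice(sysIdx + 1);
    const userIdxs = [];
    after.forEach((m, i) => { if (m.role === "user") userIdxs.push(i); });
    const start = userIdxs.length > iRecentUserMsgCnt
        ? userIdxs[userIdxs.length - iRecentUserMsgCnt]
        : 0;
    return latestSys.concat(after.slice(start));              // >0 : last N user msgs and their responses
}
```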
||||
|
||||
Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
|
||||
may not be visible. Also remember that just refreshing/reloading page in browser or for that
|
||||
matter clearing site data, dont directly override site caching in all cases. Worst case you may
|
||||
have to change port. Or in dev tools of browser, you may be able to disable caching fully.
|
||||
|
||||
|
||||
The concept of multiple chat sessions with different servers, as well as saving and restoring them across
browser usage sessions, can be woven around the SimpleChat/MultiChatUI classes and their instances
relatively easily; however, given the current goal of keeping this simple, it has not been added for now.

By switching between chat.add_system_begin/anytime, one can control whether the system prompt can be
changed anytime during the conversation or only at the beginning.

read_json_early is there to experiment with reading json response data early on, if available, so that the
user can be shown the generated data as and when it is being generated, rather than only at the end when
the full data is available.

The server flow doesn't seem to be sending back data early, at least for the request (including options)
that is currently sent.

If it becomes possible to read json data early on in future, as and when the ai model is generating data,
then this helper will need to indirectly update the chat div with the received data, without waiting for
the overall data to be available.

### Default setup

By default things are set up to try and make the user experience a bit better, if possible. However a
developer testing the server or an ai-model may want to change these values.

Using iRecentUserMsgCnt, the chat history context sent to the server/ai-model is reduced to just the
system prompt, the previous user request and its ai response, and the current user request, instead of
the full chat history. This way, if there is any response with garbage/repetition, it doesn't mess with
things beyond the next question/request/query, in some ways.

max_tokens is set to 1024, so that a relatively large previous response doesn't eat up the space available
for the next query-response. However don't forget that the server, when started, should also be given a
model context size of 1k or more, to be on the safe side.

The /completions endpoint of examples/server doesn't take max_tokens; instead it takes the internal
n_predict. For now the same is added here on the client side; maybe later max_tokens handling can be
added to the /completions endpoint code on the server side.

The frequency and presence penalty fields are set to 1.2 in the set of fields sent to the server along
with the user query, so that the model is partly nudged to avoid repeating text in its response.

An end user can change these behaviours by editing gMe from the browser's devel-tools/console.

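For example, a developer who wants the server/ai-model to see the full history again, without the client-side caps, could tweak the defaults from the devel-tools/console along these lines (a sketch of the kind of changes meant above, with illustrative values, not a recommendation):

```js
gMe.iRecentUserMsgCnt = -1;                       // go back to sending the full chat history
gMe.chatRequestOptions["max_tokens"] = 2048;      // allow longer responses (the server needs a matching model context size)
gMe.chatRequestOptions["n_predict"] = 2048;       // same, for the /completions endpoint
gMe.chatRequestOptions["frequency_penalty"] = 0;  // drop the anti-repetition nudging
gMe.chatRequestOptions["presence_penalty"] = 0;
```
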
## At the end

Also a thank you to all open source and open model developers, who strive for the common good.
|
|
@ -48,6 +48,13 @@ button {
|
|||
flex-direction: column;
|
||||
}
|
||||
|
||||
.ul1 {
|
||||
padding-inline-start: 2vw;
|
||||
}
|
||||
.ul2 {
|
||||
padding-inline-start: 2vw;
|
||||
}
|
||||
|
||||
* {
|
||||
margin: 0.6vmin;
|
||||
}
|
||||
|
|
|
@ -14,23 +14,86 @@ class ApiEP {
|
|||
}
|
||||
|
||||
let gUsageMsg = `
|
||||
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
||||
<p> Enter your text to the ai assistant below.</p>
|
||||
<p> Use shift+enter for inserting enter.</p>
|
||||
<p> Refresh the page to start over fresh.</p>
|
||||
<p class="role-system">Usage</p>
|
||||
<ul class="ul1">
|
||||
<li> Set system prompt above, to try to control ai response characteristics, if model supports same.</li>
|
||||
<ul class="ul2">
|
||||
<li> Completion mode normally wont have a system prompt.</li>
|
||||
</ul>
|
||||
<li> Enter your query to ai assistant below.</li>
|
||||
<ul class="ul2">
|
||||
<li> Completion mode doesnt insert user/role: prefix implicitly.</li>
|
||||
<li> Use shift+enter for inserting enter/newline.</li>
|
||||
</ul>
|
||||
<li> Default ContextWindow = [System, Last Query+Resp, Cur Query].</li>
|
||||
<ul class="ul2">
|
||||
<li> experiment iRecentUserMsgCnt, max_tokens, model ctxt window to expand</li>
|
||||
</ul>
|
||||
</ul>
|
||||
`;
|
||||
|
||||
/** @typedef {{role: string, content: string}[]} ChatMessages */
|
||||
|
||||
class SimpleChat {
|
||||
|
||||
constructor() {
|
||||
/**
|
||||
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
|
||||
* @type {{role: string, content: string}[]}
|
||||
* @type {ChatMessages}
|
||||
*/
|
||||
this.xchat = [];
|
||||
this.iLastSys = -1;
|
||||
}
|
||||
|
||||
clear() {
|
||||
this.xchat = [];
|
||||
this.iLastSys = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recent chat messages.
|
||||
* If iRecentUserMsgCnt < 0
|
||||
* Then return the full chat history
|
||||
* Else
|
||||
* Return chat messages from latest going back till the last/latest system prompt.
|
||||
* While keeping track that the number of user queries/messages doesnt exceed iRecentUserMsgCnt.
|
||||
* @param {number} iRecentUserMsgCnt
|
||||
*/
|
||||
recent_chat(iRecentUserMsgCnt) {
|
||||
if (iRecentUserMsgCnt < 0) {
|
||||
return this.xchat;
|
||||
}
|
||||
if (iRecentUserMsgCnt == 0) {
|
||||
console.warn("WARN:SimpleChat:SC:RecentChat:iRecentUsermsgCnt of 0 means no user message/query sent");
|
||||
}
|
||||
/** @type{ChatMessages} */
|
||||
let rchat = [];
|
||||
let sysMsg = this.get_system_latest();
|
||||
if (sysMsg.length != 0) {
|
||||
rchat.push({role: Roles.System, content: sysMsg});
|
||||
}
|
||||
let iUserCnt = 0;
|
||||
let iStart = this.xchat.length;
|
||||
for(let i=this.xchat.length-1; i > this.iLastSys; i--) {
|
||||
if (iUserCnt >= iRecentUserMsgCnt) {
|
||||
break;
|
||||
}
|
||||
let msg = this.xchat[i];
|
||||
if (msg.role == Roles.User) {
|
||||
iStart = i;
|
||||
iUserCnt += 1;
|
||||
}
|
||||
}
|
||||
for(let i = iStart; i < this.xchat.length; i++) {
|
||||
let msg = this.xchat[i];
|
||||
if (msg.role == Roles.System) {
|
||||
continue;
|
||||
}
|
||||
rchat.push({role: msg.role, content: msg.content});
|
||||
}
|
||||
return rchat;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an entry into xchat
|
||||
* @param {string} role
|
||||
|
@ -57,7 +120,7 @@ class SimpleChat {
|
|||
div.replaceChildren();
|
||||
}
|
||||
let last = undefined;
|
||||
for(const x of this.xchat) {
|
||||
for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||
let entry = document.createElement("p");
|
||||
entry.className = `role-${x.role}`;
|
||||
entry.innerText = `${x.role}: ${x.content}`;
|
||||
|
@ -69,17 +132,21 @@ class SimpleChat {
|
|||
} else {
|
||||
if (bClear) {
|
||||
div.innerHTML = gUsageMsg;
|
||||
gMe.show_info(div);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint
|
||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
* The needed fields/options are picked from a global object.
|
||||
* Convert the json into string.
|
||||
* @param {Object} obj
|
||||
*/
|
||||
request_jsonstr(obj) {
|
||||
obj["temperature"] = 0.7;
|
||||
for(let k in gMe.chatRequestOptions) {
|
||||
obj[k] = gMe.chatRequestOptions[k];
|
||||
}
|
||||
return JSON.stringify(obj);
|
||||
}
|
||||
|
||||
|
@ -88,18 +155,27 @@ class SimpleChat {
|
|||
*/
|
||||
request_messages_jsonstr() {
|
||||
let req = {
|
||||
messages: this.xchat,
|
||||
messages: this.recent_chat(gMe.iRecentUserMsgCnt),
|
||||
}
|
||||
return this.request_jsonstr(req);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a string form of json object suitable for /completions
|
||||
* @param {boolean} bInsertStandardRolePrefix Insert "<THE_ROLE>: " as prefix wrt each role's message
|
||||
*/
|
||||
request_prompt_jsonstr() {
|
||||
request_prompt_jsonstr(bInsertStandardRolePrefix) {
|
||||
let prompt = "";
|
||||
for(const chat of this.xchat) {
|
||||
prompt += `${chat.role}: ${chat.content}\n`;
|
||||
let iCnt = 0;
|
||||
for(const chat of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||
iCnt += 1;
|
||||
if (iCnt > 1) {
|
||||
prompt += "\n";
|
||||
}
|
||||
if (bInsertStandardRolePrefix) {
|
||||
prompt += `${chat.role}: `;
|
||||
}
|
||||
prompt += `${chat.content}`;
|
||||
}
|
||||
let req = {
|
||||
prompt: prompt,
|
||||
|
@ -171,7 +247,6 @@ let gChatURL = {
|
|||
'chat': `${gBaseURL}/chat/completions`,
|
||||
'completion': `${gBaseURL}/completions`,
|
||||
}
|
||||
const gbCompletionFreshChatAlways = true;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -291,6 +366,8 @@ class MultiChatUI {
|
|||
// allow user to insert enter into their message using shift+enter.
|
||||
// while just pressing enter key will lead to submitting.
|
||||
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
||||
let value = this.elInUser.value;
|
||||
this.elInUser.value = value.substring(0,value.length-1);
|
||||
this.elBtnUser.click();
|
||||
ev.preventDefault();
|
||||
}
|
||||
|
@ -321,6 +398,29 @@ class MultiChatUI {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Try read json response early, if available.
|
||||
* @param {Response} resp
|
||||
*/
|
||||
async read_json_early(resp) {
|
||||
if (!resp.body) {
|
||||
throw Error("ERRR:SimpleChat:MCUI:ReadJsonEarly:No body...");
|
||||
}
|
||||
let tdUtf8 = new TextDecoder("utf-8");
|
||||
let rr = resp.body.getReader();
|
||||
let gotBody = "";
|
||||
while(true) {
|
||||
let { value: cur, done: done} = await rr.read();
|
||||
let curBody = tdUtf8.decode(cur);
|
||||
console.debug("DBUG:SC:PART:", curBody);
|
||||
gotBody += curBody;
|
||||
if (done) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return JSON.parse(gotBody);
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle user query submit request, wrt specified chat session.
|
||||
* @param {string} chatId
|
||||
|
@ -330,6 +430,14 @@ class MultiChatUI {
|
|||
|
||||
let chat = this.simpleChats[chatId];
|
||||
|
||||
// In completion mode, if configured, clear any previous chat history.
|
||||
// So if user wants to simulate a multi-chat based completion query,
|
||||
// they will have to enter the full thing, as a suitable multiline
|
||||
// user input/query.
|
||||
if ((apiEP == ApiEP.Completion) && (gMe.bCompletionFreshChatAlways)) {
|
||||
chat.clear();
|
||||
}
|
||||
|
||||
chat.add_system_anytime(this.elInSystem.value, chatId);
|
||||
|
||||
let content = this.elInUser.value;
|
||||
|
@ -344,7 +452,7 @@ class MultiChatUI {
|
|||
if (apiEP == ApiEP.Chat) {
|
||||
theBody = chat.request_messages_jsonstr();
|
||||
} else {
|
||||
theBody = chat.request_prompt_jsonstr();
|
||||
theBody = chat.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix);
|
||||
}
|
||||
|
||||
this.elInUser.value = "working...";
|
||||
|
@ -359,6 +467,7 @@ class MultiChatUI {
|
|||
});
|
||||
|
||||
let respBody = await resp.json();
|
||||
//let respBody = await this.read_json_early(resp);
|
||||
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
|
||||
let assistantMsg;
|
||||
if (apiEP == ApiEP.Chat) {
|
||||
|
@ -376,13 +485,6 @@ class MultiChatUI {
|
|||
} else {
|
||||
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
|
||||
}
|
||||
// Purposefully clear at end rather than begin of this function
|
||||
// so that one can switch from chat to completion mode and sequece
|
||||
// in a completion mode with multiple user-assistant chat data
|
||||
// from before to be sent/occur once.
|
||||
if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) {
|
||||
chat.xchat.length = 0;
|
||||
}
|
||||
this.ui_reset_userinput();
|
||||
}
|
||||
|
||||
|
@ -462,17 +564,66 @@ class MultiChatUI {
|
|||
}
|
||||
|
||||
|
||||
let gMuitChat;
|
||||
const gChatIds = [ "Default", "Other" ];
|
||||
class Me {
|
||||
|
||||
constructor() {
|
||||
this.defaultChatIds = [ "Default", "Other" ];
|
||||
this.multiChat = new MultiChatUI();
|
||||
this.bCompletionFreshChatAlways = true;
|
||||
this.bCompletionInsertStandardRolePrefix = false;
|
||||
this.iRecentUserMsgCnt = 2;
|
||||
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
this.chatRequestOptions = {
|
||||
"temperature": 0.7,
|
||||
"max_tokens": 1024,
|
||||
"frequency_penalty": 1.2,
|
||||
"presence_penalty": 1.2,
|
||||
"n_predict": 1024
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* @param {HTMLDivElement} elDiv
|
||||
*/
|
||||
show_info(elDiv) {
|
||||
|
||||
var p = document.createElement("p");
|
||||
p.innerText = "Settings (devel-tools-console gMe)";
|
||||
p.className = "role-system";
|
||||
elDiv.appendChild(p);
|
||||
|
||||
var p = document.createElement("p");
|
||||
p.innerText = `bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
p = document.createElement("p");
|
||||
p.innerText = `chatRequestOptions:${JSON.stringify(this.chatRequestOptions)}`;
|
||||
elDiv.appendChild(p);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** @type {Me} */
|
||||
let gMe;
|
||||
|
||||
function startme() {
|
||||
console.log("INFO:SimpleChat:StartMe:Starting...");
|
||||
gMuitChat = new MultiChatUI();
|
||||
for (let cid of gChatIds) {
|
||||
gMuitChat.new_chat_session(cid);
|
||||
gMe = new Me();
|
||||
for (let cid of gMe.defaultChatIds) {
|
||||
gMe.multiChat.new_chat_session(cid);
|
||||
}
|
||||
gMuitChat.setup_ui(gChatIds[0]);
|
||||
gMuitChat.show_sessions();
|
||||
gMe.multiChat.setup_ui(gMe.defaultChatIds[0], true);
|
||||
gMe.multiChat.show_sessions();
|
||||
}
|
||||
|
||||
document.addEventListener("DOMContentLoaded", startme);
|
||||
|
|
|
@ -3,40 +3,390 @@
|
|||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#include <windows.h>
|
||||
#include <shellapi.h> // For CommandLineToArgvW
|
||||
#endif
|
||||
|
||||
static void print_usage_information(const char * argv0, FILE * stream) {
|
||||
fprintf(stream, "usage: %s [options]\n\n", argv0);
|
||||
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
|
||||
fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
|
||||
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
|
||||
fprintf(stream, "to control the behavior of the tokenizer.\n\n");
|
||||
fprintf(stream, " The possible options are:\n");
|
||||
fprintf(stream, "\n");
|
||||
fprintf(stream, " -h, --help print this help and exit\n");
|
||||
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
|
||||
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
|
||||
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
|
||||
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
|
||||
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
||||
fprintf(stream, " --stdin read prompt from standard input.\n");
|
||||
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||
}
|
||||
|
||||
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) text;
|
||||
(void) user_data;
|
||||
}
|
||||
|
||||
static std::string read_prompt_from_file(const char * filepath, bool & success) {
|
||||
success = false;
|
||||
|
||||
std::ifstream in(filepath, std::ios::binary);
|
||||
if (!in) {
|
||||
fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
|
||||
return std::string();
|
||||
}
|
||||
// do not assume the file is seekable (e.g. /dev/stdin)
|
||||
std::stringstream buffer;
|
||||
buffer << in.rdbuf();
|
||||
if (in.fail()) {
|
||||
fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
|
||||
return std::string();
|
||||
}
|
||||
|
||||
success = true;
|
||||
return buffer.str();
|
||||
}
|
||||
|
||||
//
|
||||
// Function: ingest_args(...) -> vector<string>
|
||||
//
|
||||
// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded
|
||||
// strings, as an STL vector<string>.
|
||||
//
|
||||
// In particular, it handles character encoding shenanigans on Windows.
|
||||
//
|
||||
// Note: raw_argc and raw_argv are not actually read at all on Windows.
|
||||
// On Windows we call GetCommandLineW to get the arguments in wchar_t
|
||||
// format, ignoring the regular argc/argv arguments to main().
|
||||
//
|
||||
// TODO: potential opportunity to roll common stuff into common/console.cpp
|
||||
// in relation to Windows wchar_t shenanigans.
|
||||
static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) {
|
||||
std::vector<std::string> argv;
|
||||
|
||||
// Handle Windows, if given non-ASCII arguments.
|
||||
// We convert wchar_t arguments into UTF-8 char* on this platform.
|
||||
// Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters
|
||||
// without throwing tantrums.
|
||||
#if defined(_WIN32)
|
||||
int argc;
|
||||
const LPWSTR cmdline_wargv = GetCommandLineW();
|
||||
LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc);
|
||||
|
||||
// silence unused arg warnings
|
||||
(void) raw_argc;
|
||||
(void) raw_argv;
|
||||
|
||||
for (int i = 0; i < argc; ++i) {
|
||||
int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL);
|
||||
char * output_buf = (char *) calloc(length_needed+1, sizeof(char));
|
||||
GGML_ASSERT(output_buf);
|
||||
|
||||
WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL);
|
||||
output_buf[length_needed] = '\0';
|
||||
|
||||
argv.push_back(output_buf);
|
||||
free(output_buf);
|
||||
}
|
||||
|
||||
LocalFree((HLOCAL) wargv);
|
||||
#else
|
||||
int argc = raw_argc;
|
||||
for (int i = 0; i < argc; ++i) {
|
||||
argv.push_back(raw_argv[i]);
|
||||
}
|
||||
#endif
|
||||
|
||||
GGML_ASSERT((unsigned int) argc == argv.size());
|
||||
|
||||
return argv;
|
||||
}
|
||||
|
||||
//
|
||||
// Function: write_utf8_cstr_to_stdout(const char *) -> <writes to stdout>
|
||||
//
|
||||
// writes a string to standard output; taking into account that on Windows
|
||||
// to display correctly you have to use special handling. Works even if the
|
||||
// user has not set a unicode code page on a Windows cmd.exe.
|
||||
//
|
||||
// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something
|
||||
// human-readable is written instead.
|
||||
//
|
||||
// On non-Windows systems, simply printfs() the string.
|
||||
static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
|
||||
invalid_utf8 = false;
|
||||
|
||||
#if defined(_WIN32)
|
||||
// Are we in a console?
|
||||
HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
|
||||
DWORD dwMode = 0;
|
||||
|
||||
// According to Microsoft docs:
|
||||
// "WriteConsole fails if it is used with a standard handle that is redirected to a file."
|
||||
// Also according to the docs, you can use GetConsoleMode to check for that.
|
||||
if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
|
||||
printf("%s", str);
|
||||
return;
|
||||
}
|
||||
|
||||
// MultiByteToWideChar reports an error if str is empty, don't report
|
||||
// them as invalid_utf8.
|
||||
if (*str == 0) {
|
||||
return;
|
||||
}
|
||||
int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0);
|
||||
if (length_needed == 0) {
|
||||
DWORD err = GetLastError();
|
||||
if (err == ERROR_NO_UNICODE_TRANSLATION) {
|
||||
invalid_utf8 = true;
|
||||
int len = strlen(str);
|
||||
printf("<");
|
||||
for (int i = 0; i < len; ++i) {
|
||||
if (i > 0) {
|
||||
printf(" ");
|
||||
}
|
||||
printf("%02x", (uint8_t) str[i]);
|
||||
}
|
||||
printf(">");
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
|
||||
}
|
||||
|
||||
LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
|
||||
GGML_ASSERT(wstr);
|
||||
|
||||
MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed);
|
||||
WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL);
|
||||
|
||||
free(wstr);
|
||||
#else
|
||||
// TODO: reporting invalid_utf8 would be useful on non-Windows too.
|
||||
// printf will silently just write bad unicode.
|
||||
printf("%s", str);
|
||||
#endif
|
||||
}
|
||||
|
||||
int main(int raw_argc, char ** raw_argv) {
|
||||
const std::vector<std::string> argv = ingest_args(raw_argc, raw_argv);
|
||||
const int argc = argv.size();
|
||||
|
||||
if (argc <= 1) {
|
||||
print_usage_information(argv[0].c_str(), stderr);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const char * model_path = argv[1];
|
||||
const char * prompt = argv[2];
|
||||
//////
|
||||
// Read out all the command line arguments.
|
||||
//////
|
||||
|
||||
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
|
||||
// variables where to put any arguments we see.
|
||||
bool printing_ids = false;
|
||||
bool no_bos = false;
|
||||
bool disable_logging = false;
|
||||
const char * model_path = NULL;
|
||||
const char * prompt_path = NULL;
|
||||
const char * prompt_arg = NULL;
|
||||
|
||||
// track which arguments were explicitly given
|
||||
// used for sanity checking down the line
|
||||
bool model_path_set = false;
|
||||
bool prompt_path_set = false;
|
||||
bool prompt_set = false;
|
||||
bool stdin_set = false;
|
||||
|
||||
int iarg = 1;
|
||||
for (; iarg < argc; ++iarg) {
|
||||
std::string arg{argv[iarg]};
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
print_usage_information(argv[0].c_str(), stdout);
|
||||
return 0;
|
||||
}
|
||||
else if (arg == "--ids") {
|
||||
printing_ids = true;
|
||||
}
|
||||
else if (arg == "-m" || arg == "--model") {
|
||||
if (model_path_set) {
|
||||
fprintf(stderr, "Error: -m or --model specified multiple times.\n");
|
||||
return 1;
|
||||
}
|
||||
model_path = argv[++iarg].c_str();
|
||||
model_path_set = true;
|
||||
}
|
||||
else if (arg == "--no-bos") {
|
||||
no_bos = true;
|
||||
}
|
||||
else if (arg == "-p" || arg == "--prompt") {
|
||||
if (prompt_set) {
|
||||
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
|
||||
return 1;
|
||||
}
|
||||
prompt_arg = argv[++iarg].c_str();
|
||||
prompt_set = true;
|
||||
}
|
||||
else if (arg == "-f" || arg == "--file") {
|
||||
if (prompt_path_set) {
|
||||
fprintf(stderr, "Error: -f or --file specified multiple times.\n");
|
||||
return 1;
|
||||
}
|
||||
prompt_path = argv[++iarg].c_str();
|
||||
prompt_path_set = true;
|
||||
}
|
||||
else if (arg == "--stdin") {
|
||||
stdin_set = true;
|
||||
}
|
||||
else if (arg == "--log-disable") {
|
||||
disable_logging = true;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
//////
|
||||
// Sanity check the command line arguments.
|
||||
//////
|
||||
|
||||
// Check that we have the required stuff set.
|
||||
if (model_path_set && model_path == NULL) {
|
||||
fprintf(stderr, "Error: --model requires an argument.\n");
|
||||
return 1;
|
||||
}
|
||||
if (!model_path_set) {
|
||||
fprintf(stderr, "Error: must specify --model.\n");
|
||||
return 1;
|
||||
}
|
||||
if (prompt_path_set && prompt_path == NULL) {
|
||||
fprintf(stderr, "Error: --file requires an argument.\n");
|
||||
return 1;
|
||||
}
|
||||
if (prompt_set && prompt_arg == NULL) {
|
||||
fprintf(stderr, "Error: --prompt requires an argument.\n");
|
||||
return 1;
|
||||
}
|
||||
const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
|
||||
if (prompts_set > 1) {
|
||||
fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
|
||||
return 1;
|
||||
}
|
||||
// Must have some prompt.
|
||||
if (prompts_set == 0) {
|
||||
fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
GGML_ASSERT(model_path);
|
||||
GGML_ASSERT(prompt_path || prompt_arg || stdin_set);
|
||||
|
||||
//////
|
||||
// Figure out where will the prompt come from.
|
||||
//////
|
||||
|
||||
std::string prompt;
|
||||
if (prompt_path_set) {
|
||||
bool success = false;
|
||||
prompt = read_prompt_from_file(prompt_path, success);
|
||||
if (!success) {
|
||||
return 1;
|
||||
}
|
||||
} else if (prompt_set) {
|
||||
prompt = prompt_arg;
|
||||
} else {
|
||||
GGML_ASSERT(stdin_set);
|
||||
// we read stdin *after* loading model (early exit if model cannot
|
||||
// be loaded, which can be a nicer user experience)
|
||||
}
|
||||
|
||||
//////
|
||||
// Start actually doing the tokenizing stuff.
|
||||
//////
|
||||
|
||||
#ifdef LOG_DISABLE_LOGS
|
||||
disable_logging = true;
|
||||
#endif
|
||||
|
||||
if (disable_logging) {
|
||||
llama_log_set(llama_log_callback_null, NULL);
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.vocab_only = true;
|
||||
llama_model * model = llama_load_model_from_file(model_path, model_params);
|
||||
if (!model) {
|
||||
fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||
if (!ctx) {
|
||||
fprintf(stderr, "Error: could not create context.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// read entire prompt from stdin?
|
||||
if (stdin_set) {
|
||||
GGML_ASSERT(!prompt_path_set && !prompt_set);
|
||||
|
||||
std::stringstream stdin_buffer;
|
||||
stdin_buffer << std::cin.rdbuf();
|
||||
if (std::cin.fail()) {
|
||||
fprintf(stderr, "Error: could not read the entire standard input.\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
prompt = stdin_buffer.str();
|
||||
}
|
||||
|
||||
const bool model_wants_add_bos = llama_should_add_bos_token(model);
|
||||
const bool add_bos = model_wants_add_bos && !no_bos;
|
||||
|
||||
std::vector<llama_token> tokens;
|
||||
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
||||
|
||||
tokens = ::llama_tokenize(model, prompt, true, true);
|
||||
if (printing_ids) {
|
||||
printf("[");
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) tokens.size(); i++) {
|
||||
if (printing_ids) {
|
||||
printf("%d\n", tokens[i]);
|
||||
if (i > 0) {
|
||||
printf(", ");
|
||||
}
|
||||
printf("%d", tokens[i]);
|
||||
} else {
|
||||
printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
|
||||
bool invalid_utf8 = false;
|
||||
printf("%6d -> '", tokens[i]);
|
||||
write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
|
||||
if (invalid_utf8) {
|
||||
printf("' (utf-8 decode failure)\n");
|
||||
} else {
|
||||
printf("'\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (printing_ids) {
|
||||
printf("]\n");
|
||||
}
|
||||
|
||||
// silence valgrind
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
12
flake.lock
generated
12
flake.lock
generated
|
@ -5,11 +5,11 @@
|
|||
"nixpkgs-lib": "nixpkgs-lib"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1714641030,
|
||||
"narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=",
|
||||
"lastModified": 1715865404,
|
||||
"narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=",
|
||||
"owner": "hercules-ci",
|
||||
"repo": "flake-parts",
|
||||
"rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e",
|
||||
"rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
@ -20,11 +20,11 @@
|
|||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1714635257,
|
||||
"narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=",
|
||||
"lastModified": 1716509168,
|
||||
"narHash": "sha256-4zSIhSRRIoEBwjbPm3YiGtbd8HDWzFxJjw5DYSDy1n8=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f",
|
||||
"rev": "bfb7a882678e518398ce9a31a881538679f6f092",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
33
ggml-cuda.cu
33
ggml-cuda.cu
|
@ -119,6 +119,20 @@ int ggml_cuda_get_device() {
|
|||
return id;
|
||||
}
|
||||
|
||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||
ggml_cuda_set_device(device);
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
|
||||
auto res = hipMallocManaged(ptr, size);
|
||||
if (res == hipSuccess) {
|
||||
// if there is an error we "need" to know why...
|
||||
CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
|
||||
}
|
||||
return res;
|
||||
#else
|
||||
return cudaMalloc(ptr, size);
|
||||
#endif
|
||||
}
|
||||
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
|
@ -271,7 +285,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
|||
size_t look_ahead_size = (size_t) (1.05 * size);
|
||||
look_ahead_size = 256 * ((look_ahead_size + 255)/256);
|
||||
ggml_cuda_set_device(device);
|
||||
CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
|
||||
CUDA_CHECK(ggml_cuda_device_malloc(&ptr, look_ahead_size, device));
|
||||
*actual_size = look_ahead_size;
|
||||
pool_size += look_ahead_size;
|
||||
#ifdef DEBUG_CUDA_MALLOC
|
||||
|
@ -537,7 +551,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffe
|
|||
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
|
||||
|
||||
void * dev_ptr;
|
||||
cudaError_t err = cudaMalloc(&dev_ptr, size);
|
||||
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
|
||||
if (err != cudaSuccess) {
|
||||
// clear the error
|
||||
cudaGetLastError();
|
||||
|
@ -798,7 +812,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu
|
|||
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
|
||||
ggml_cuda_set_device(id);
|
||||
char * buf;
|
||||
CUDA_CHECK(cudaMalloc(&buf, size));
|
||||
CUDA_CHECK(ggml_cuda_device_malloc((void**)&buf, size, id));
|
||||
|
||||
// set padding to 0 to avoid possible NaN values
|
||||
if (size > original_size) {
|
||||
|
@ -2510,9 +2524,9 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||
|
||||
bool use_cuda_graph = true;
|
||||
bool cuda_graph_update_required = false;
|
||||
// pointer to CUDA cpy kernel, which is required to identify
|
||||
// vector of pointers to CUDA cpy kernels, which are required to identify
|
||||
// kernel parameters which need to be updated in the graph for each token
|
||||
void * ggml_cuda_cpy_fn_ptr = nullptr;
|
||||
std::vector<void *> ggml_cuda_cpy_fn_ptrs;
|
||||
|
||||
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
||||
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
|
||||
|
@ -2588,9 +2602,10 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||
if (node->op == GGML_OP_CPY) {
|
||||
// store the copy op parameter which changes with each token.
|
||||
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
||||
if (ggml_cuda_cpy_fn_ptr == nullptr) {
|
||||
// store a pointer to the copy op CUDA kernel to identify it later
|
||||
ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||
// store a pointer to each copy op CUDA kernel to identify it later
|
||||
void * ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||
if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
|
||||
ggml_cuda_cpy_fn_ptrs.push_back(ptr);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2720,7 +2735,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||
if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
|
||||
int k = 0;
|
||||
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
||||
if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
|
||||
if(count(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), cuda_ctx->cuda_graph->params[i].func) > 0) {
|
||||
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
|
||||
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
|
||||
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
|
||||
|
|
|
@ -79,13 +79,8 @@
|
|||
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
|
||||
#define cudaHostUnregister hipHostUnregister
|
||||
#define cudaLaunchHostFunc hipLaunchHostFunc
|
||||
#ifdef GGML_HIP_UMA
|
||||
#define cudaMalloc hipMallocManaged
|
||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
|
||||
#else
|
||||
#define cudaMalloc hipMalloc
|
||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||
#endif
|
||||
#define cudaMemcpy hipMemcpy
|
||||
#define cudaMemcpyAsync hipMemcpyAsync
|
||||
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
|
||||
|
|
|
@ -1,15 +1,68 @@
|
|||
#include "concat.cuh"
|
||||
|
||||
static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) {
|
||||
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
// operation
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (nidx < ne00) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne00 +
|
||||
blockIdx.z * ne00 * gridDim.y;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
(nidx - ne00) +
|
||||
blockIdx.y * (ne0 - ne00) +
|
||||
blockIdx.z * (ne0 - ne00) * gridDim.y;
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (blockIdx.y < ne01) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * ne01;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
nidx +
|
||||
(blockIdx.y - ne01) * ne0 +
|
||||
blockIdx.z * ne0 * (gridDim.y - ne01);
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (blockIdx.z < ne02) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
|
@ -25,25 +78,53 @@ static __global__ void concat_f32(const float * x,const float * y, float * dst,
|
|||
}
|
||||
}
|
||||
|
||||
static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) {
|
||||
static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
|
||||
int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
|
||||
dim3 gridDim(num_blocks, ne1, ne2);
|
||||
concat_f32<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
|
||||
if (dim == 0) {
|
||||
concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
|
||||
return;
|
||||
}
|
||||
if (dim == 1) {
|
||||
concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
|
||||
return;
|
||||
}
|
||||
concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
const float * src1_d = (const float *)src1->data;
|
||||
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_cuda(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), dst_d + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], stream);
|
||||
if (dim != 3) {
|
||||
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
concat_f32_cuda(
|
||||
src0_d + i3 * (src0->nb[3] / 4),
|
||||
src1_d + i3 * (src1->nb[3] / 4),
|
||||
dst_d + i3 * ( dst->nb[3] / 4),
|
||||
src0->ne[0], src0->ne[1], src0->ne[2],
|
||||
dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
|
||||
}
|
||||
} else {
|
||||
const size_t size0 = ggml_nbytes(src0);
|
||||
const size_t size1 = ggml_nbytes(src1);
|
||||
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream));
|
||||
CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -144,6 +144,10 @@ extern "C" {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
// 16-bit float
|
||||
// on Arm, we use __fp16
|
||||
// on x86, we use uint16_t
|
||||
|
|
71
ggml-metal.m
71
ggml-metal.m
|
@ -35,6 +35,10 @@ enum ggml_metal_kernel_type {
|
|||
GGML_METAL_KERNEL_TYPE_MUL_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_DIV,
|
||||
GGML_METAL_KERNEL_TYPE_DIV_ROW,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_F32,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_F16,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_I32,
|
||||
GGML_METAL_KERNEL_TYPE_REPEAT_I16,
|
||||
GGML_METAL_KERNEL_TYPE_SCALE,
|
||||
GGML_METAL_KERNEL_TYPE_SCALE_4,
|
||||
GGML_METAL_KERNEL_TYPE_CLAMP,
|
||||
|
@ -184,9 +188,9 @@ enum ggml_metal_kernel_type {
|
|||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256,
|
||||
//GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128,
|
||||
GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256,
|
||||
//GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, // https://github.com/ggerganov/llama.cpp/issues/7261
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
|
||||
GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
|
||||
|
@ -485,6 +489,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F32, repeat_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_F16, repeat_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I32, repeat_i32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_REPEAT_I16, repeat_i16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
|
||||
|
@ -634,9 +642,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, ctx->support_simdgroup_mm);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
|
||||
//GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, ctx->support_simdgroup_reduction);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true);
|
||||
|
@ -746,6 +754,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|||
case GGML_OP_ACC:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_SCALE:
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_SQR:
|
||||
|
@ -770,6 +779,9 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|||
case GGML_OP_LEAKY_RELU:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
if (op->src[0]->ne[0] == 256) {
|
||||
return false;
|
||||
}
|
||||
return ctx->support_simdgroup_mm; // TODO: over-restricted for vec-kernels
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
|
@ -976,10 +988,10 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||
switch (dst->op) {
|
||||
case GGML_OP_CONCAT:
|
||||
{
|
||||
const int64_t nb = ne00;
|
||||
|
||||
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
|
||||
|
||||
const int32_t dim = ((int32_t *) dst->op_params)[0];
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
|
@ -1008,7 +1020,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
|
||||
[encoder setBytes:&dim length:sizeof(dim) atIndex:27];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
|
@ -1018,11 +1030,14 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
{
|
||||
GGML_ASSERT(src0t == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||
|
||||
const size_t offs = 0;
|
||||
|
||||
bool bcast_row = false;
|
||||
|
||||
int64_t nb = ne00;
|
||||
int64_t nb = ne00; // used by the "row" kernels
|
||||
|
||||
id<MTLComputePipelineState> pipeline = nil;
|
||||
|
||||
|
@ -1091,6 +1106,42 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_REPEAT:
|
||||
{
|
||||
id<MTLComputePipelineState> pipeline;
|
||||
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F32].pipeline; break;
|
||||
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
|
||||
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
|
||||
case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
|
||||
default: GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
[encoder setComputePipelineState:pipeline];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:10];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:11];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:12];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:13];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:14];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:16];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||
|
||||
const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_ACC:
|
||||
{
|
||||
GGML_ASSERT(src0t == GGML_TYPE_F32);
|
||||
|
@ -2573,7 +2624,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||
case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break;
|
||||
case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break;
|
||||
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
|
||||
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
|
||||
//case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||
|
@ -2586,7 +2637,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
|||
|
||||
switch (ne00) {
|
||||
case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break;
|
||||
case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
|
||||
//case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break;
|
||||
default:
|
||||
{
|
||||
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
|
||||
|
|
|
@ -168,6 +168,53 @@ kernel void kernel_div(
|
|||
}
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
kernel void kernel_repeat(
|
||||
device const char * src0,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i03 = i3 % ne03;
|
||||
const int64_t i02 = i2 % ne02;
|
||||
const int64_t i01 = i1 % ne01;
|
||||
|
||||
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
|
||||
device char * dst_ptr = dst + i3*nb3 + i2*nb2 + i1*nb1 ;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
const int i00 = i0 % ne00;
|
||||
*((device T *)(dst_ptr + i0*nb0)) = *((device T *)(src0_ptr + i00*nb00));
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_repeat<float>) kernel_repeat_t;
|
||||
|
||||
template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat<float>;
|
||||
template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat<half>;
|
||||
template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat<int>;
|
||||
template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat<short>;
|
||||
|
||||
// assumption: src1 is a row
|
||||
// broadcast src1 into src0
|
||||
kernel void kernel_add_row(
|
||||
|
@ -2418,7 +2465,7 @@ template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f
|
|||
template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>;
|
||||
template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;
|
||||
//template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>;
|
||||
|
||||
template<int64_t D, int64_t Q = 1, int64_t C = 32> // head size, queries per threadgroup, cache items per threadgroup
|
||||
kernel void kernel_flash_attn_ext_vec_f16(
|
||||
|
@ -2696,7 +2743,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
|||
}
|
||||
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>;
|
||||
template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
|
||||
//template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>;
|
||||
|
||||
kernel void kernel_cpy_f16_f16(
|
||||
device const half * src0,
|
||||
|
@ -3319,31 +3366,30 @@ kernel void kernel_concat(
|
|||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
constant int32_t & dim,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i03 = tgpig.z;
|
||||
const int64_t i02 = tgpig.y;
|
||||
const int64_t i01 = tgpig.x;
|
||||
const int64_t i3 = tgpig.z;
|
||||
const int64_t i2 = tgpig.y;
|
||||
const int64_t i1 = tgpig.x;
|
||||
|
||||
const int64_t i13 = i03 % ne13;
|
||||
const int64_t i12 = i02 % ne12;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
int64_t o[4] = {0, 0, 0, 0};
|
||||
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));
|
||||
|
||||
device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + tpitg.x*nb00;
|
||||
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
|
||||
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;
|
||||
device const float * x;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
if (i02 < ne02) {
|
||||
((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
|
||||
src0_ptr += ntg.x*nb00;
|
||||
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
|
||||
x = (device const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00);
|
||||
} else {
|
||||
((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
|
||||
src1_ptr += ntg.x*nb10;
|
||||
x = (device const float *)(src1 + (i3 - o[3])*nb13 + (i2 - o[2])*nb12 + (i1 - o[1])*nb11 + (i0 - o[0])*nb10);
|
||||
}
|
||||
dst_ptr += ntg.x*nb0;
|
||||
|
||||
device float * y = (device float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
||||
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -3813,7 +3813,44 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_NEON)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
||||
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
||||
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
|
||||
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_q4_0 * restrict x0 = &x[i + 0];
|
||||
const block_q4_0 * restrict x1 = &x[i + 1];
|
||||
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||
|
||||
// load x
|
||||
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||
|
||||
// 4-bit -> 8-bit
|
||||
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
|
||||
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
|
||||
|
||||
// sub 8
|
||||
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
||||
const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
|
||||
|
||||
// load y
|
||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||
|
||||
// dot product
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
#elif defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
|
@ -5384,7 +5421,32 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
return;
|
||||
}
|
||||
#endif
|
||||
#if defined(__ARM_NEON)
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||
|
||||
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_q8_0 * restrict x0 = &x[i + 0];
|
||||
const block_q8_0 * restrict x1 = &x[i + 1];
|
||||
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||
|
||||
// load x
|
||||
const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
|
||||
const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
|
||||
|
||||
// load y
|
||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||
|
||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||
}
|
||||
|
||||
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||
#elif defined(__ARM_NEON)
|
||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||
|
||||
|
|
|
@@ -8830,12 +8830,11 @@ static void rope(
        dst[i + 1] = x0*sin_theta + x1*cos_theta;
}

template<typename T, bool has_pos>
template<typename T, bool has_pos, bool has_freq_facs>
static void rope_neox(
    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
    ,
    const sycl::nd_item<3> &item_ct1) {
    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims,
    const float * freq_factors, const sycl::nd_item<3> &item_ct1) {
    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
                         item_ct1.get_local_id(1));

@@ -8863,8 +8862,10 @@ static void rope_neox(
        float cur_rot = inv_ndims * ic - ib;

        const int p = has_pos ? pos[i2] : 0;
        const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;

        const float theta_base =
            p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
            p * freq_scale * dpct::pow(theta_scale, col / 2.0f)/freq_factor;

        float cos_theta, sin_theta;
        rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
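The functional change here is small: when frequency factors are present, the per-rotation angle is divided by the matching factor. Not from the patch, a rough scalar sketch of that computation with the same variable roles (names are hypothetical, plain C math instead of SYCL):

#include <math.h>
// Sketch only: NeoX RoPE base angle with an optional frequency-factor table.
static float neox_theta_base(int p, float freq_scale, float theta_scale, float col,
                             const float * freq_factors, int ic) {
    const float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
    return p * freq_scale * powf(theta_scale, col / 2.0f) / freq_factor;
}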
@@ -12413,7 +12414,7 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
                           const int32_t *pos, float freq_scale,
                           int p_delta_rows, float freq_base, float ext_factor,
                           float attn_factor, rope_corr_dims corr_dims,
                           dpct::queue_ptr stream) {
                           const float * freq_factors, dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % 2 == 0);
    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
    const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
@@ -12423,38 +12424,48 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
    const float inv_ndims = -1.0f / n_dims;

    if (pos == nullptr) {
        /*
        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
        the limit. To get the device limit, query
        info::device::max_work_group_size. Adjust the work-group size if needed.
        */
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->parallel_for(
            sycl::nd_range<3>(block_nums * block_dims, block_dims),
            [=](sycl::nd_item<3> item_ct1) {
                rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale,
                                    p_delta_rows, ext_factor, attn_factor,
                                    corr_dims, theta_scale, inv_ndims,
                                    item_ct1);
            });
        if (freq_factors == nullptr) {
            stream->parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) {
                    rope_neox<T, false, false>(x, dst, ncols, n_dims, pos, freq_scale,
                                               p_delta_rows, ext_factor, attn_factor,
                                               corr_dims, theta_scale, inv_ndims, freq_factors,
                                               item_ct1);
                });
        } else {
            stream->parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) {
                    rope_neox<T, false, true>(x, dst, ncols, n_dims, pos, freq_scale,
                                              p_delta_rows, ext_factor, attn_factor,
                                              corr_dims, theta_scale, inv_ndims, freq_factors,
                                              item_ct1);
                });
        }
    } else {
        /*
        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
        the limit. To get the device limit, query
        info::device::max_work_group_size. Adjust the work-group size if needed.
        */
        dpct::has_capability_or_fail(stream->get_device(),
                                     {sycl::aspect::fp16});

        stream->parallel_for(
            sycl::nd_range<3>(block_nums * block_dims, block_dims),
            [=](sycl::nd_item<3> item_ct1) {
                rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale,
                                   p_delta_rows, ext_factor, attn_factor,
                                   corr_dims, theta_scale, inv_ndims, item_ct1);
            });
        if (freq_factors == nullptr) {
            stream->parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) {
                    rope_neox<T, true, false>(x, dst, ncols, n_dims, pos, freq_scale,
                                              p_delta_rows, ext_factor, attn_factor,
                                              corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
                });
        } else {
            stream->parallel_for(
                sycl::nd_range<3>(block_nums * block_dims, block_dims),
                [=](sycl::nd_item<3> item_ct1) {
                    rope_neox<T, true, true>(x, dst, ncols, n_dims, pos, freq_scale,
                                             p_delta_rows, ext_factor, attn_factor,
                                             corr_dims, theta_scale, inv_ndims, freq_factors, item_ct1);
                });
        }
    }
}
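Note the dispatch pattern: whether freq_factors is present is decided once at the launch site and baked into the kernel as a template parameter, so the inner loop never branches on it. Not from the patch, a stripped-down sketch of the same idea with hypothetical names:

// Generic sketch of runtime-flag -> template-parameter dispatch (hypothetical kernel).
template <bool has_freq_facs>
static void my_kernel(const float * freq_factors) {
    // Inside the hot loop a check like the one below is resolved at compile time:
    // const float ff = has_freq_facs ? freq_factors[i] : 1.0f;
    (void) freq_factors;
}

static void launch(const float * freq_factors) {
    if (freq_factors == nullptr) {
        my_kernel<false>(freq_factors);
    } else {
        my_kernel<true>(freq_factors);
    }
}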
@@ -13501,6 +13512,10 @@ inline void ggml_sycl_op_concat(const ggml_tensor *src0,
                                const float *src0_dd, const float *src1_dd,
                                float *dst_dd,
                                const dpct::queue_ptr &main_stream) {
#pragma message("TODO: generalize concat kernel for dim != 2")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7563")
    int dim = dst->op_params[0];
    GGML_ASSERT(dim != 2);

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
@@ -13986,9 +14001,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
                              ggml_tensor *dst, const float *src0_dd,
                              const float *src1_dd, float *dst_dd,
                              const dpct::queue_ptr &main_stream) {
#pragma message("TODO: implement phi3 frequency factors support")
#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
    const ggml_tensor * src2 = dst->src[2];

    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);

@@ -14014,6 +14027,7 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
    memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
    memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

    const float * freq_factors = nullptr;
    const int32_t * pos = nullptr;
    if ((mode & 1) == 0) {
        GGML_ASSERT(src1->type == GGML_TYPE_I32);

@@ -14024,6 +14038,16 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
    const bool is_neox = mode & 2;
    const bool is_glm = mode & 4;

    if (is_neox) {
        pos = (const int32_t *) src1_dd;

        if (src2 != nullptr) {
            freq_factors = (const float *) src2->data;
        }
    } else {
        GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox");
    }

    rope_corr_dims corr_dims;
    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);

@@ -14035,13 +14059,13 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
        if (src0->type == GGML_TYPE_F32) {
            rope_neox_sycl(
                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                attn_factor, corr_dims, main_stream
                attn_factor, corr_dims, freq_factors, main_stream
            );
        } else if (src0->type == GGML_TYPE_F16) {
            rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
                           ne00, n_dims, nrows, pos, freq_scale, ne01,
                           freq_base, ext_factor, attn_factor, corr_dims,
                           main_stream);
                           freq_factors, main_stream);
        } else {
            GGML_ASSERT(false);
        }

@@ -15243,6 +15267,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
        }
    } else {
        bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
        use_mul_mat_q = use_mul_mat_q && (src0->type != GGML_TYPE_IQ2_XXS);

        if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
            use_mul_mat_q = false;
ggml.c (71 changed lines)
@@ -4882,10 +4882,21 @@ struct ggml_tensor * ggml_repeat_back(
// ggml_concat

struct ggml_tensor * ggml_concat(
    struct ggml_context* ctx,
    struct ggml_tensor* a,
    struct ggml_tensor* b) {
    GGML_ASSERT(a->ne[0] == b->ne[0] && a->ne[1] == b->ne[1] && a->ne[3] == b->ne[3]);
    struct ggml_context * ctx,
    struct ggml_tensor * a,
    struct ggml_tensor * b,
    int dim) {
    GGML_ASSERT(dim >= 0 && dim < GGML_MAX_DIMS);

    int64_t ne[GGML_MAX_DIMS];
    for (int d = 0; d < GGML_MAX_DIMS; ++d) {
        if (d == dim) {
            ne[d] = a->ne[d] + b->ne[d];
            continue;
        }
        GGML_ASSERT(a->ne[d] == b->ne[d]);
        ne[d] = a->ne[d];
    }

    bool is_node = false;

@@ -4893,7 +4904,9 @@ struct ggml_tensor * ggml_concat(
        is_node = true;
    }

    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, a->ne[0], a->ne[1], a->ne[2] + b->ne[2], a->ne[3]);
    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, GGML_MAX_DIMS, ne);

    ggml_set_op_params_i32(result, 0, dim);

    result->op = GGML_OP_CONCAT;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
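With the extra dim argument, callers now choose the concatenation axis instead of always getting dim 2. Not from the patch, a small usage sketch with made-up tensor sizes:

// a: [64, 32, 4, 1] and b: [64, 16, 4, 1] concatenated along dim 1 give [64, 48, 4, 1];
// every other dimension must match, as asserted in ggml_concat() above.
struct ggml_tensor * a = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 32, 4, 1);
struct ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 64, 16, 4, 1);
struct ggml_tensor * c = ggml_concat(ctx, a, b, 1); // the old API always concatenated on dim 2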
@@ -5013,6 +5026,7 @@ struct ggml_tensor * ggml_leaky_relu(
    }

    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);

    ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));

    result->op = GGML_OP_LEAKY_RELU;
@@ -10967,26 +10981,29 @@ static void ggml_compute_forward_concat_f32(
    GGML_ASSERT(nb00 == sizeof(float));
    GGML_ASSERT(nb10 == sizeof(float));

    const int32_t dim = ggml_get_op_params_i32(dst, 0);

    GGML_ASSERT(dim >= 0 && dim < 4);

    int64_t o[4] = {0, 0, 0, 0};
    o[dim] = src0->ne[dim];

    const float * x;

    // TODO: smarter multi-theading
    for (int i3 = 0; i3 < ne3; i3++) {
        for (int i2 = ith; i2 < ne2; i2 += nth) {
            if (i2 < ne02) { // src0
                for (int i1 = 0; i1 < ne1; i1++) {
                    for (int i0 = 0; i0 < ne0; i0++) {
                        const float * x = (float *)((char *) src0->data + i0 * nb00 + i1 * nb01 + i2 * nb02 + i3 * nb03);

                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
                        *y = *x;
            for (int i1 = 0; i1 < ne1; i1++) {
                for (int i0 = 0; i0 < ne0; i0++) {
                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
                        x = (const float *) ((const char *)src0->data + (i0 )*nb00 + (i1 )*nb01 + (i2 )*nb02 + (i3 )*nb03);
                    } else {
                        x = (const float *) ((const char *)src1->data + (i0 - o[0])*nb10 + (i1 - o[1])*nb11 + (i2 - o[2])*nb12 + (i3 - o[3])*nb13);
                    }
                }
            } // src1
            else {
                for (int i1 = 0; i1 < ne1; i1++) {
                    for (int i0 = 0; i0 < ne0; i0++) {
                        const float * x = (float *)((char *) src1->data + i0 * nb10 + i1 * nb11 + (i2 - ne02) * nb12 + i3 * nb13);

                        float * y = (float *)((char *)dst->data + i0 * nb0 + i1 * nb1 + i2 * nb2 + i3 * nb3);
                        *y = *x;
                    }
                    float * y = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);

                    *y = *x;
                }
            }
        }

@@ -10994,7 +11011,7 @@ static void ggml_compute_forward_concat_f32(
    }

static void ggml_compute_forward_concat(
    const struct ggml_compute_params* params,
    const struct ggml_compute_params * params,
    struct ggml_tensor* dst) {

    const struct ggml_tensor * src0 = dst->src[0];
@@ -22742,6 +22759,16 @@ int ggml_cpu_has_neon(void) {
#endif
}

int ggml_cpu_has_sve(void) {
#if defined(__ARM_FEATURE_SVE)
    // TODO: Currently, SVE 256 bit is only supported.
    GGML_ASSERT(svcntb() == QK8_0);
    return 1;
#else
    return 0;
#endif
}

int ggml_cpu_has_arm_fma(void) {
#if defined(__ARM_FEATURE_FMA)
    return 1;
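The assertion above ties the SVE paths to a 256-bit vector length: each q8_0 block is exactly QK8_0 = 32 bytes, and the new dot-product loops map one block onto one vector register. Not from the patch, an isolated sketch of the same runtime check:

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
// svcntb() is the SVE vector length in bytes; the new kernels assume it equals QK8_0 (32),
// i.e. a 256-bit SVE implementation, which is what the TODO above refers to.
static int sve_vector_length_is_256_bit(void) {
    return svcntb() == 32;
}
#endif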
ggml.h (6 changed lines)
@@ -1007,12 +1007,13 @@ extern "C" {
            struct ggml_tensor * a,
            struct ggml_tensor * b);

    // concat a and b on dim 2
    // concat a and b along dim
    // used in stable-diffusion
    GGML_API struct ggml_tensor * ggml_concat(
            struct ggml_context * ctx,
            struct ggml_tensor * a,
            struct ggml_tensor * b);
            struct ggml_tensor * b,
            int dim);

    GGML_API struct ggml_tensor * ggml_abs(
            struct ggml_context * ctx,

@@ -2404,6 +2405,7 @@ extern "C" {
    GGML_API int ggml_cpu_has_avx512_bf16(void);
    GGML_API int ggml_cpu_has_fma        (void);
    GGML_API int ggml_cpu_has_neon       (void);
    GGML_API int ggml_cpu_has_sve        (void);
    GGML_API int ggml_cpu_has_arm_fma    (void);
    GGML_API int ggml_cpu_has_metal      (void);
    GGML_API int ggml_cpu_has_f16c       (void);
@@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
import numpy as np
import numpy.typing as npt

from .quants import quant_shape_to_byte_shape

if __name__ == "__main__":
    import sys
    from pathlib import Path

@@ -251,6 +253,7 @@ class GGUFReader:
            tensor_names.add(tensor_name)
            ggml_type = GGMLQuantizationType(raw_dtype[0])
            n_elems = int(np.prod(dims))
            np_dims = tuple(reversed(dims.tolist()))
            block_size, type_size = GGML_QUANT_SIZES[ggml_type]
            n_bytes = n_elems * type_size // block_size
            data_offs = int(start_offs + offset_tensor[0])

@@ -279,6 +282,7 @@ class GGUFReader:
            else:
                item_count = n_bytes
                item_type = np.uint8
                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
            tensors.append(ReaderTensor(
                name = tensor_name,
                tensor_type = ggml_type,

@@ -286,7 +290,7 @@ class GGUFReader:
                n_elements = n_elems,
                n_bytes = n_bytes,
                data_offset = data_offs,
                data = self._get(data_offs, item_type, item_count),
                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                field = field,
            ))
        self.tensors = tensors
@@ -13,7 +13,6 @@ from string import ascii_letters, digits
import numpy as np

from .constants import (
    GGML_QUANT_SIZES,
    GGUF_DEFAULT_ALIGNMENT,
    GGUF_MAGIC,
    GGUF_VERSION,

@@ -26,6 +25,8 @@ from .constants import (
    TokenType,
)

from .quants import quant_shape_from_byte_shape

logger = logging.getLogger(__name__)

@@ -229,10 +230,7 @@ class GGUFWriter:
        else:
            dtype = raw_dtype
            if tensor_dtype == np.uint8:
                block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
                if tensor_shape[-1] % type_size != 0:
                    raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
                tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
        n_dims = len(tensor_shape)
        self.ti_data += self._pack("I", n_dims)
        for i in range(n_dims):
@@ -1,5 +1,5 @@
from __future__ import annotations
from typing import Callable
from typing import Callable, Sequence

from numpy.typing import DTypeLike

@@ -9,6 +9,20 @@ from .lazy import LazyNumpyTensor
import numpy as np


def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    if shape[-1] % block_size != 0:
        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
    return (*shape[:-1], shape[-1] // block_size * type_size)


def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    if shape[-1] % type_size != 0:
        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
    return (*shape[:-1], shape[-1] // type_size * block_size)


# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
    n = n.astype(np.float32, copy=False).view(np.int32)
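The two helpers are inverses of each other. As a worked example (not from the patch), assuming Q4_0's block size of 32 elements and type size of 18 bytes per block:

// quant_shape_to_byte_shape((4096, 4096), Q4_0)   -> (4096, 4096 / 32 * 18) = (4096, 2304)
// quant_shape_from_byte_shape((4096, 2304), Q4_0) -> (4096, 2304 / 18 * 32) = (4096, 4096)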
@@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new

    for tensor in reader.tensors:
        total_bytes += tensor.n_bytes
        # Dimensions are written in reverse order, so flip them first
        shape = np.flipud(tensor.shape).tolist()
        writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)

    bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
@@ -4690,6 +4690,9 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "dbrx") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
            } else if (
                tokenizer_pre == "smaug-bpe") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }

@@ -12912,6 +12915,7 @@ struct llm_tokenizer_bpe {
                });
                break;
            case LLAMA_VOCAB_PRE_TYPE_DBRX:
            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
                word_collection = unicode_regex_split(text, {
                    // same as llama3
                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",

@@ -18262,6 +18266,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
    );
}

bool llama_token_is_control(const struct llama_model * model, llama_token token) {
    return llama_is_control_token(model->vocab, token);
}

llama_token llama_token_bos(const struct llama_model * model) {
    return model->vocab.special_bos_id;
}

@@ -18738,6 +18746,7 @@ const char * llama_print_system_info(void) {
    s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
    s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
llama.h (12 changed lines)
@@ -85,6 +85,7 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
        LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
        LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
        LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
    };

    // note: these values should be synchronized with ggml_rope

@@ -264,6 +265,8 @@ extern "C" {
        bool check_tensors; // validate model tensor data
    };

    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
    // https://github.com/ggerganov/llama.cpp/pull/7544
    struct llama_context_params {
        uint32_t seed;  // RNG seed, -1 for random
        uint32_t n_ctx; // text context, 0 = from model

@@ -290,14 +293,14 @@ extern "C" {
        ggml_backend_sched_eval_callback cb_eval;
        void * cb_eval_user_data;

        enum ggml_type type_k; // data type for K cache
        enum ggml_type type_v; // data type for V cache
        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
        bool embeddings;  // if true, extract embeddings (together with logits)
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
        bool flash_attn;  // whether to use flash attention
        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]

        // Abort callback
        // if it returns true, execution of llama_decode() will be aborted

@@ -823,6 +826,9 @@ extern "C" {
    // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
    LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);

    // Identify if Token Id is a control token or a render-able token
    LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);

    // Special tokens
    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
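llama_token_is_control() lets callers distinguish control tokens from renderable ones. Not from the patch, a small usage sketch; the helper and variable names are hypothetical and only the predicate itself comes from this change:

#include <vector>
// Count control tokens in a sampled sequence, e.g. to decide what to skip when rendering text.
static size_t count_control_tokens(const struct llama_model * model, const std::vector<llama_token> & toks) {
    size_t n = 0;
    for (const llama_token t : toks) {
        if (llama_token_is_control(model, t)) {
            n++;
        }
    }
    return n;
}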
@@ -1259,22 +1259,26 @@ struct test_im2col : public test_case {
// GGML_OP_CONCAT
struct test_concat : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const int64_t b_ne2;
    const std::array<int64_t, 4> ne_a;
    const int64_t ne_b_d;
    const int dim;

    std::string vars() override {
        return VARS_TO_STR3(type, ne, b_ne2);
        return VARS_TO_STR4(type, ne_a, ne_b_d, dim);
    }

    test_concat(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10},
            int64_t b_ne2 = 10)
        : type(type), ne(ne), b_ne2(b_ne2) {}
            std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
            int64_t ne_b_d = 10,
            int dim = 2)
        : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_tensor * b = ggml_new_tensor_4d(ctx, type, ne[0], ne[1], b_ne2, ne[3]);
        ggml_tensor * out = ggml_concat(ctx, a, b);
        auto ne_b = ne_a;
        ne_b[dim] = ne_b_d;
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
        ggml_tensor * out = ggml_concat(ctx, a, b, dim);
        return out;
    }
};

@@ -2211,8 +2215,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        }
    }

    test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
    test_cases.emplace_back(new test_concat(GGML_TYPE_I32));
    for (int dim : { 0, 1, 2, 3, }) {
        test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim));
        test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim));
    }

    for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) {
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order));